#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence that is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, in
# the latter on 64-bit ones. All I had to do was get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So
# *if* there is a way to improve it, *then* the only way would be to
# try to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unroll, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# only *if* it's actually possible to noticeably improve the overall
# ILP, instruction level parallelism, on the given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates apparently are not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for the SSSE3 SHA256 improvement is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for a
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
#
# (*)	whichever best applicable;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
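
# Reference only: a plain-Perl model of one SHA-256 round, not part of the
# original CRYPTOGAMS code and never called by the generator.  It spells out
# the Ch/Maj/Sigma structure that ROUND_00_15 and body_00_15 below implement,
# including the "alternative Maj" mentioned in the May 2012 note above:
# Maj(a,b,c) = Ch(a^b,c,b) = ((b^c)&(a^b))^b.
sub sha256_round_ref {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$K,$W) = @_;
	my $rotr = sub { my ($x,$n)=@_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff };

	my $Sigma1 = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
	my $Sigma0 = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
	my $Ch     = (($f^$g)&$e)^$g;		# same form as the assembly uses
	my $Maj    = (($b^$c)&($a^$b))^$b;	# "alternative Maj"
	my $T1     = ($h + $Sigma1 + $Ch + $K + $W) & 0xffffffff;
	my $T2     = ($Sigma0 + $Maj) & 0xffffffff;

	return (($T1+$T2)&0xffffffff,		# new a
		$a, $b, $c,
		($d+$T1)&0xffffffff,		# new e
		$e, $f, $g);
}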
123 $shaext=0; ### set to zero if compiling for 1.0.1 124 $avx=1 if (!$shaext && $avx); 125 126 open OUT,"| \"$^X\" $xlate $flavour"; 127 *STDOUT=*OUT; 128 129 if ($output =~ /512/) { 130 $func="sha512_block_data_order"; 131 $TABLE="K512"; 132 $SZ=8; 133 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", 134 "%r8", "%r9", "%r10","%r11"); 135 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); 136 @Sigma0=(28,34,39); 137 @Sigma1=(14,18,41); 138 @sigma0=(1, 8, 7); 139 @sigma1=(19,61, 6); 140 $rounds=80; 141 } else { 142 $func="sha256_block_data_order"; 143 $TABLE="K256"; 144 $SZ=4; 145 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 146 "%r8d","%r9d","%r10d","%r11d"); 147 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 148 @Sigma0=( 2,13,22); 149 @Sigma1=( 6,11,25); 150 @sigma0=( 7,18, 3); 151 @sigma1=(17,19,10); 152 $rounds=64; 153 } 154 155 $ctx="%rdi"; # 1st arg, zapped by $a3 156 $inp="%rsi"; # 2nd arg 157 $Tbl="%rbp"; 158 159 $_ctx="16*$SZ+0*8(%rsp)"; 160 $_inp="16*$SZ+1*8(%rsp)"; 161 $_end="16*$SZ+2*8(%rsp)"; 162 $_rsp="16*$SZ+3*8(%rsp)"; 163 $framesz="16*$SZ+4*8"; 164 165 166 sub ROUND_00_15() 167 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 168 my $STRIDE=$SZ; 169 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 170 171 $code.=<<___; 172 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 173 mov $f,$a2 174 175 xor $e,$a0 176 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 177 xor $g,$a2 # f^g 178 179 mov $T1,`$SZ*($i&0xf)`(%rsp) 180 xor $a,$a1 181 and $e,$a2 # (f^g)&e 182 183 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 184 add $h,$T1 # T1+=h 185 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 186 187 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 188 xor $e,$a0 189 add $a2,$T1 # T1+=Ch(e,f,g) 190 191 mov $a,$a2 192 add ($Tbl),$T1 # T1+=K[round] 193 xor $a,$a1 194 195 xor $b,$a2 # a^b, b^c in next round 196 ror \$$Sigma1[0],$a0 # Sigma1(e) 197 mov $b,$h 198 199 and $a2,$a3 200 ror \$$Sigma0[0],$a1 # Sigma0(a) 201 add $a0,$T1 # T1+=Sigma1(e) 202 203 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 204 add $T1,$d # d+=T1 205 add $T1,$h # h+=T1 206 207 lea $STRIDE($Tbl),$Tbl # round++ 208 ___ 209 $code.=<<___ if ($i<15); 210 add $a1,$h # h+=Sigma0(a) 211 ___ 212 ($a2,$a3) = ($a3,$a2); 213 } 214 215 sub ROUND_16_XX() 216 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 217 218 $code.=<<___; 219 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 220 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 221 222 mov $a0,$T1 223 ror \$`$sigma0[1]-$sigma0[0]`,$a0 224 add $a1,$a # modulo-scheduled h+=Sigma0(a) 225 mov $a2,$a1 226 ror \$`$sigma1[1]-$sigma1[0]`,$a2 227 228 xor $T1,$a0 229 shr \$$sigma0[2],$T1 230 ror \$$sigma0[0],$a0 231 xor $a1,$a2 232 shr \$$sigma1[2],$a1 233 234 ror \$$sigma1[0],$a2 235 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 236 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 237 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 238 239 add `$SZ*($i&0xf)`(%rsp),$T1 240 mov $e,$a0 241 add $a2,$T1 242 mov $a,$a1 243 ___ 244 &ROUND_00_15(@_); 245 } 246 247 $code=<<___; 248 .text 249 250 .extern OPENSSL_ia32cap_P 251 .globl $func 252 .type $func,\@function,3 253 .align 16 254 $func: 255 ___ 256 $code.=<<___ if ($SZ==4 || $avx); 257 lea OPENSSL_ia32cap_P(%rip),%r11 258 mov 0(%r11),%r9d 259 mov 4(%r11),%r10d 260 mov 8(%r11),%r11d 261 ___ 262 $code.=<<___ if ($SZ==4 && $shaext); 263 test \$`1<<29`,%r11d # check for SHA 264 jnz _shaext_shortcut 265 ___ 266 $code.=<<___ if ($avx && $SZ==8); 267 test \$`1<<11`,%r10d # check for XOP 268 jnz .Lxop_shortcut 269 ___ 270 $code.=<<___ if ($avx>1); 271 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 272 cmp \$`1<<8|1<<5|1<<3`,%r11d 273 je .Lavx2_shortcut 
274 ___ 275 $code.=<<___ if ($avx); 276 and \$`1<<30`,%r9d # mask "Intel CPU" bit 277 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 278 or %r9d,%r10d 279 cmp \$`1<<28|1<<9|1<<30`,%r10d 280 je .Lavx_shortcut 281 ___ 282 $code.=<<___ if ($SZ==4); 283 test \$`1<<9`,%r10d 284 jnz .Lssse3_shortcut 285 ___ 286 $code.=<<___; 287 push %rbx 288 push %rbp 289 push %r12 290 push %r13 291 push %r14 292 push %r15 293 mov %rsp,%r11 # copy %rsp 294 shl \$4,%rdx # num*16 295 sub \$$framesz,%rsp 296 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 297 and \$-64,%rsp # align stack frame 298 mov $ctx,$_ctx # save ctx, 1st arg 299 mov $inp,$_inp # save inp, 2nd arh 300 mov %rdx,$_end # save end pointer, "3rd" arg 301 mov %r11,$_rsp # save copy of %rsp 302 .Lprologue: 303 304 mov $SZ*0($ctx),$A 305 mov $SZ*1($ctx),$B 306 mov $SZ*2($ctx),$C 307 mov $SZ*3($ctx),$D 308 mov $SZ*4($ctx),$E 309 mov $SZ*5($ctx),$F 310 mov $SZ*6($ctx),$G 311 mov $SZ*7($ctx),$H 312 jmp .Lloop 313 314 .align 16 315 .Lloop: 316 mov $B,$a3 317 lea $TABLE(%rip),$Tbl 318 xor $C,$a3 # magic 319 ___ 320 for($i=0;$i<16;$i++) { 321 $code.=" mov $SZ*$i($inp),$T1\n"; 322 $code.=" mov @ROT[4],$a0\n"; 323 $code.=" mov @ROT[0],$a1\n"; 324 $code.=" bswap $T1\n"; 325 &ROUND_00_15($i,@ROT); 326 unshift(@ROT,pop(@ROT)); 327 } 328 $code.=<<___; 329 jmp .Lrounds_16_xx 330 .align 16 331 .Lrounds_16_xx: 332 ___ 333 for(;$i<32;$i++) { 334 &ROUND_16_XX($i,@ROT); 335 unshift(@ROT,pop(@ROT)); 336 } 337 338 $code.=<<___; 339 cmpb \$0,`$SZ-1`($Tbl) 340 jnz .Lrounds_16_xx 341 342 mov $_ctx,$ctx 343 add $a1,$A # modulo-scheduled h+=Sigma0(a) 344 lea 16*$SZ($inp),$inp 345 346 add $SZ*0($ctx),$A 347 add $SZ*1($ctx),$B 348 add $SZ*2($ctx),$C 349 add $SZ*3($ctx),$D 350 add $SZ*4($ctx),$E 351 add $SZ*5($ctx),$F 352 add $SZ*6($ctx),$G 353 add $SZ*7($ctx),$H 354 355 cmp $_end,$inp 356 357 mov $A,$SZ*0($ctx) 358 mov $B,$SZ*1($ctx) 359 mov $C,$SZ*2($ctx) 360 mov $D,$SZ*3($ctx) 361 mov $E,$SZ*4($ctx) 362 mov $F,$SZ*5($ctx) 363 mov $G,$SZ*6($ctx) 364 mov $H,$SZ*7($ctx) 365 jb .Lloop 366 367 mov $_rsp,%rsi 368 mov (%rsi),%r15 369 mov 8(%rsi),%r14 370 mov 16(%rsi),%r13 371 mov 24(%rsi),%r12 372 mov 32(%rsi),%rbp 373 mov 40(%rsi),%rbx 374 lea 48(%rsi),%rsp 375 .Lepilogue: 376 ret 377 .size $func,.-$func 378 ___ 379 380 if ($SZ==4) { 381 $code.=<<___; 382 .align 64 383 .type $TABLE,\@object 384 $TABLE: 385 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 386 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 387 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 388 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 389 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 390 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 391 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 392 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 393 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 394 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 395 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 396 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 397 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 398 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 399 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 400 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 401 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 402 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 403 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 404 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 405 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 406 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 407 .long 
0xd192e819,0xd6990624,0xf40e3585,0x106aa070 408 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 409 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 410 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 411 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 412 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 413 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 414 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 415 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 416 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 417 418 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 419 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 420 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 421 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 422 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 423 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 424 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 425 ___ 426 } else { 427 $code.=<<___; 428 .align 64 429 .type $TABLE,\@object 430 $TABLE: 431 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 432 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 433 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 434 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 435 .quad 0x3956c25bf348b538,0x59f111f1b605d019 436 .quad 0x3956c25bf348b538,0x59f111f1b605d019 437 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 438 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 439 .quad 0xd807aa98a3030242,0x12835b0145706fbe 440 .quad 0xd807aa98a3030242,0x12835b0145706fbe 441 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 442 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 443 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 444 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 445 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 446 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 447 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 448 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 449 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 450 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 451 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 452 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 453 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 454 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 455 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 456 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 457 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 458 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 459 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 460 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 461 .quad 0x06ca6351e003826f,0x142929670a0e6e70 462 .quad 0x06ca6351e003826f,0x142929670a0e6e70 463 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 464 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 465 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 466 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 467 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 468 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 469 .quad 0x81c2c92e47edaee6,0x92722c851482353b 470 .quad 0x81c2c92e47edaee6,0x92722c851482353b 471 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 472 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 473 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 474 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 475 .quad 0xd192e819d6ef5218,0xd69906245565a910 476 .quad 0xd192e819d6ef5218,0xd69906245565a910 477 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 478 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 479 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 480 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 481 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 482 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 483 .quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 484 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 485 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 486 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 487 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 488 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 489 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 490 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 491 .quad 0x90befffa23631e28,0xa4506cebde82bde9 492 .quad 0x90befffa23631e28,0xa4506cebde82bde9 493 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 494 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 495 .quad 0xca273eceea26619c,0xd186b8c721c0c207 496 .quad 0xca273eceea26619c,0xd186b8c721c0c207 497 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 498 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 499 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 500 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 501 .quad 0x113f9804bef90dae,0x1b710b35131c471b 502 .quad 0x113f9804bef90dae,0x1b710b35131c471b 503 .quad 0x28db77f523047d84,0x32caab7b40c72493 504 .quad 0x28db77f523047d84,0x32caab7b40c72493 505 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 506 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 507 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 508 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 509 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 510 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 511 512 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 513 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 514 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 515 ___ 516 } 517 518 ###################################################################### 519 # SIMD code paths 520 # 521 if ($SZ==4 && $shaext) {{{ 522 ###################################################################### 523 # Intel SHA Extensions implementation of SHA256 update function. 
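#
# Outline of the code below: the hash state is loaded as DCBA/HGFE from *ctx
# and repacked (pshufd/palignr/punpcklqdq) into the ABEF/CDGH pairs that
# sha256rnds2 expects.  Each sha256rnds2 performs two rounds, with the two
# W+K values supplied implicitly in %xmm0 ($Wi); the pshufd with immediate
# 0x0e between the paired instructions moves the upper W+K qword into place,
# so each pair covers four rounds.  sha256msg1 and sha256msg2 compute the
# sigma0/sigma1 parts of the message schedule, and the palignr/paddd between
# them supplies the remaining X[i-7] term.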
524 # 525 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 526 527 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 528 my @MSG=map("%xmm$_",(3..6)); 529 530 $code.=<<___; 531 .type sha256_block_data_order_shaext,\@function,3 532 .align 64 533 sha256_block_data_order_shaext: 534 _shaext_shortcut: 535 ___ 536 $code.=<<___ if ($win64); 537 lea `-8-5*16`(%rsp),%rsp 538 movaps %xmm6,-8-5*16(%rax) 539 movaps %xmm7,-8-4*16(%rax) 540 movaps %xmm8,-8-3*16(%rax) 541 movaps %xmm9,-8-2*16(%rax) 542 movaps %xmm10,-8-1*16(%rax) 543 .Lprologue_shaext: 544 ___ 545 $code.=<<___; 546 lea K256+0x80(%rip),$Tbl 547 movdqu ($ctx),$ABEF # DCBA 548 movdqu 16($ctx),$CDGH # HGFE 549 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 550 551 pshufd \$0x1b,$ABEF,$Wi # ABCD 552 pshufd \$0xb1,$ABEF,$ABEF # CDAB 553 pshufd \$0x1b,$CDGH,$CDGH # EFGH 554 movdqa $TMP,$BSWAP # offload 555 palignr \$8,$CDGH,$ABEF # ABEF 556 punpcklqdq $Wi,$CDGH # CDGH 557 jmp .Loop_shaext 558 559 .align 16 560 .Loop_shaext: 561 movdqu ($inp),@MSG[0] 562 movdqu 0x10($inp),@MSG[1] 563 movdqu 0x20($inp),@MSG[2] 564 pshufb $TMP,@MSG[0] 565 movdqu 0x30($inp),@MSG[3] 566 567 movdqa 0*32-0x80($Tbl),$Wi 568 paddd @MSG[0],$Wi 569 pshufb $TMP,@MSG[1] 570 movdqa $CDGH,$CDGH_SAVE # offload 571 sha256rnds2 $ABEF,$CDGH # 0-3 572 pshufd \$0x0e,$Wi,$Wi 573 nop 574 movdqa $ABEF,$ABEF_SAVE # offload 575 sha256rnds2 $CDGH,$ABEF 576 577 movdqa 1*32-0x80($Tbl),$Wi 578 paddd @MSG[1],$Wi 579 pshufb $TMP,@MSG[2] 580 sha256rnds2 $ABEF,$CDGH # 4-7 581 pshufd \$0x0e,$Wi,$Wi 582 lea 0x40($inp),$inp 583 sha256msg1 @MSG[1],@MSG[0] 584 sha256rnds2 $CDGH,$ABEF 585 586 movdqa 2*32-0x80($Tbl),$Wi 587 paddd @MSG[2],$Wi 588 pshufb $TMP,@MSG[3] 589 sha256rnds2 $ABEF,$CDGH # 8-11 590 pshufd \$0x0e,$Wi,$Wi 591 movdqa @MSG[3],$TMP 592 palignr \$4,@MSG[2],$TMP 593 nop 594 paddd $TMP,@MSG[0] 595 sha256msg1 @MSG[2],@MSG[1] 596 sha256rnds2 $CDGH,$ABEF 597 598 movdqa 3*32-0x80($Tbl),$Wi 599 paddd @MSG[3],$Wi 600 sha256msg2 @MSG[3],@MSG[0] 601 sha256rnds2 $ABEF,$CDGH # 12-15 602 pshufd \$0x0e,$Wi,$Wi 603 movdqa @MSG[0],$TMP 604 palignr \$4,@MSG[3],$TMP 605 nop 606 paddd $TMP,@MSG[1] 607 sha256msg1 @MSG[3],@MSG[2] 608 sha256rnds2 $CDGH,$ABEF 609 ___ 610 for($i=4;$i<16-3;$i++) { 611 $code.=<<___; 612 movdqa $i*32-0x80($Tbl),$Wi 613 paddd @MSG[0],$Wi 614 sha256msg2 @MSG[0],@MSG[1] 615 sha256rnds2 $ABEF,$CDGH # 16-19... 
616 pshufd \$0x0e,$Wi,$Wi 617 movdqa @MSG[1],$TMP 618 palignr \$4,@MSG[0],$TMP 619 nop 620 paddd $TMP,@MSG[2] 621 sha256msg1 @MSG[0],@MSG[3] 622 sha256rnds2 $CDGH,$ABEF 623 ___ 624 push(@MSG,shift(@MSG)); 625 } 626 $code.=<<___; 627 movdqa 13*32-0x80($Tbl),$Wi 628 paddd @MSG[0],$Wi 629 sha256msg2 @MSG[0],@MSG[1] 630 sha256rnds2 $ABEF,$CDGH # 52-55 631 pshufd \$0x0e,$Wi,$Wi 632 movdqa @MSG[1],$TMP 633 palignr \$4,@MSG[0],$TMP 634 sha256rnds2 $CDGH,$ABEF 635 paddd $TMP,@MSG[2] 636 637 movdqa 14*32-0x80($Tbl),$Wi 638 paddd @MSG[1],$Wi 639 sha256rnds2 $ABEF,$CDGH # 56-59 640 pshufd \$0x0e,$Wi,$Wi 641 sha256msg2 @MSG[1],@MSG[2] 642 movdqa $BSWAP,$TMP 643 sha256rnds2 $CDGH,$ABEF 644 645 movdqa 15*32-0x80($Tbl),$Wi 646 paddd @MSG[2],$Wi 647 nop 648 sha256rnds2 $ABEF,$CDGH # 60-63 649 pshufd \$0x0e,$Wi,$Wi 650 dec $num 651 nop 652 sha256rnds2 $CDGH,$ABEF 653 654 paddd $CDGH_SAVE,$CDGH 655 paddd $ABEF_SAVE,$ABEF 656 jnz .Loop_shaext 657 658 pshufd \$0xb1,$CDGH,$CDGH # DCHG 659 pshufd \$0x1b,$ABEF,$TMP # FEBA 660 pshufd \$0xb1,$ABEF,$ABEF # BAFE 661 punpckhqdq $CDGH,$ABEF # DCBA 662 palignr \$8,$TMP,$CDGH # HGFE 663 664 movdqu $ABEF,($ctx) 665 movdqu $CDGH,16($ctx) 666 ___ 667 $code.=<<___ if ($win64); 668 movaps -8-5*16(%rax),%xmm6 669 movaps -8-4*16(%rax),%xmm7 670 movaps -8-3*16(%rax),%xmm8 671 movaps -8-2*16(%rax),%xmm9 672 movaps -8-1*16(%rax),%xmm10 673 mov %rax,%rsp 674 .Lepilogue_shaext: 675 ___ 676 $code.=<<___; 677 ret 678 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 679 ___ 680 }}} 681 {{{ 682 683 my $a4=$T1; 684 my ($a,$b,$c,$d,$e,$f,$g,$h); 685 686 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 687 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 688 my $arg = pop; 689 $arg = "\$$arg" if ($arg*1 eq $arg); 690 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 691 } 692 693 sub body_00_15 () { 694 ( 695 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 696 697 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 698 '&mov ($a,$a1)', 699 '&mov ($a4,$f)', 700 701 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 702 '&xor ($a0,$e)', 703 '&xor ($a4,$g)', # f^g 704 705 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 706 '&xor ($a1,$a)', 707 '&and ($a4,$e)', # (f^g)&e 708 709 '&xor ($a0,$e)', 710 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 711 '&mov ($a2,$a)', 712 713 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 714 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 715 '&xor ($a2,$b)', # a^b, b^c in next round 716 717 '&add ($h,$a4)', # h+=Ch(e,f,g) 718 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 719 '&and ($a3,$a2)', # (b^c)&(a^b) 720 721 '&xor ($a1,$a)', 722 '&add ($h,$a0)', # h+=Sigma1(e) 723 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 724 725 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 726 '&add ($d,$h)', # d+=h 727 '&add ($h,$a3)', # h+=Maj(a,b,c) 728 729 '&mov ($a0,$d)', 730 '&add ($a1,$h);'. 
# h+=Sigma0(a) 731 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 732 ); 733 } 734 735 ###################################################################### 736 # SSSE3 code path 737 # 738 if ($SZ==4) { # SHA256 only 739 my @X = map("%xmm$_",(0..3)); 740 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 741 742 $code.=<<___; 743 .type ${func}_ssse3,\@function,3 744 .align 64 745 ${func}_ssse3: 746 .Lssse3_shortcut: 747 push %rbx 748 push %rbp 749 push %r12 750 push %r13 751 push %r14 752 push %r15 753 mov %rsp,%r11 # copy %rsp 754 shl \$4,%rdx # num*16 755 sub \$`$framesz+$win64*16*4`,%rsp 756 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 757 and \$-64,%rsp # align stack frame 758 mov $ctx,$_ctx # save ctx, 1st arg 759 mov $inp,$_inp # save inp, 2nd arh 760 mov %rdx,$_end # save end pointer, "3rd" arg 761 mov %r11,$_rsp # save copy of %rsp 762 ___ 763 $code.=<<___ if ($win64); 764 movaps %xmm6,16*$SZ+32(%rsp) 765 movaps %xmm7,16*$SZ+48(%rsp) 766 movaps %xmm8,16*$SZ+64(%rsp) 767 movaps %xmm9,16*$SZ+80(%rsp) 768 ___ 769 $code.=<<___; 770 .Lprologue_ssse3: 771 772 mov $SZ*0($ctx),$A 773 mov $SZ*1($ctx),$B 774 mov $SZ*2($ctx),$C 775 mov $SZ*3($ctx),$D 776 mov $SZ*4($ctx),$E 777 mov $SZ*5($ctx),$F 778 mov $SZ*6($ctx),$G 779 mov $SZ*7($ctx),$H 780 ___ 781 782 $code.=<<___; 783 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 784 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 785 jmp .Lloop_ssse3 786 .align 16 787 .Lloop_ssse3: 788 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 789 movdqu 0x00($inp),@X[0] 790 movdqu 0x10($inp),@X[1] 791 movdqu 0x20($inp),@X[2] 792 pshufb $t3,@X[0] 793 movdqu 0x30($inp),@X[3] 794 lea $TABLE(%rip),$Tbl 795 pshufb $t3,@X[1] 796 movdqa 0x00($Tbl),$t0 797 movdqa 0x20($Tbl),$t1 798 pshufb $t3,@X[2] 799 paddd @X[0],$t0 800 movdqa 0x40($Tbl),$t2 801 pshufb $t3,@X[3] 802 movdqa 0x60($Tbl),$t3 803 paddd @X[1],$t1 804 paddd @X[2],$t2 805 paddd @X[3],$t3 806 movdqa $t0,0x00(%rsp) 807 mov $A,$a1 808 movdqa $t1,0x10(%rsp) 809 mov $B,$a3 810 movdqa $t2,0x20(%rsp) 811 xor $C,$a3 # magic 812 movdqa $t3,0x30(%rsp) 813 mov $E,$a0 814 jmp .Lssse3_00_47 815 816 .align 16 817 .Lssse3_00_47: 818 sub \$`-16*2*$SZ`,$Tbl # size optimization 819 ___ 820 sub Xupdate_256_SSSE3 () { 821 ( 822 '&movdqa ($t0,@X[1]);', 823 '&movdqa ($t3,@X[3])', 824 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 825 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 826 '&movdqa ($t1,$t0)', 827 '&movdqa ($t2,$t0);', 828 '&psrld ($t0,$sigma0[2])', 829 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 830 '&psrld ($t2,$sigma0[0])', 831 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 832 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 833 '&pxor ($t0,$t2)', 834 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 835 '&pxor ($t0,$t1)', 836 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
837 '&pxor ($t0,$t2);', 838 '&movdqa ($t2,$t3)', 839 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 840 '&psrld ($t3,$sigma1[2])', 841 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 842 '&psrlq ($t2,$sigma1[0])', 843 '&pxor ($t3,$t2);', 844 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 845 '&pxor ($t3,$t2)', 846 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 847 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 848 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 849 '&movdqa ($t2,$t3);', 850 '&psrld ($t3,$sigma1[2])', 851 '&psrlq ($t2,$sigma1[0])', 852 '&pxor ($t3,$t2);', 853 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 854 '&pxor ($t3,$t2);', 855 '&movdqa ($t2,16*2*$j."($Tbl)")', 856 '&pshufb ($t3,$t5)', 857 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 858 ); 859 } 860 861 sub SSSE3_256_00_47 () { 862 my $j = shift; 863 my $body = shift; 864 my @X = @_; 865 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 866 867 if (0) { 868 foreach (Xupdate_256_SSSE3()) { # 36 instructions 869 eval; 870 eval(shift(@insns)); 871 eval(shift(@insns)); 872 eval(shift(@insns)); 873 } 874 } else { # squeeze extra 4% on Westmere and 19% on Atom 875 eval(shift(@insns)); #@ 876 &movdqa ($t0,@X[1]); 877 eval(shift(@insns)); 878 eval(shift(@insns)); 879 &movdqa ($t3,@X[3]); 880 eval(shift(@insns)); #@ 881 eval(shift(@insns)); 882 eval(shift(@insns)); 883 eval(shift(@insns)); #@ 884 eval(shift(@insns)); 885 &palignr ($t0,@X[0],$SZ); # X[1..4] 886 eval(shift(@insns)); 887 eval(shift(@insns)); 888 &palignr ($t3,@X[2],$SZ); # X[9..12] 889 eval(shift(@insns)); 890 eval(shift(@insns)); 891 eval(shift(@insns)); 892 eval(shift(@insns)); #@ 893 &movdqa ($t1,$t0); 894 eval(shift(@insns)); 895 eval(shift(@insns)); 896 &movdqa ($t2,$t0); 897 eval(shift(@insns)); #@ 898 eval(shift(@insns)); 899 &psrld ($t0,$sigma0[2]); 900 eval(shift(@insns)); 901 eval(shift(@insns)); 902 eval(shift(@insns)); 903 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 904 eval(shift(@insns)); #@ 905 eval(shift(@insns)); 906 &psrld ($t2,$sigma0[0]); 907 eval(shift(@insns)); 908 eval(shift(@insns)); 909 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 910 eval(shift(@insns)); 911 eval(shift(@insns)); #@ 912 &pslld ($t1,8*$SZ-$sigma0[1]); 913 eval(shift(@insns)); 914 eval(shift(@insns)); 915 &pxor ($t0,$t2); 916 eval(shift(@insns)); #@ 917 eval(shift(@insns)); 918 eval(shift(@insns)); 919 eval(shift(@insns)); #@ 920 &psrld ($t2,$sigma0[1]-$sigma0[0]); 921 eval(shift(@insns)); 922 &pxor ($t0,$t1); 923 eval(shift(@insns)); 924 eval(shift(@insns)); 925 &pslld ($t1,$sigma0[1]-$sigma0[0]); 926 eval(shift(@insns)); 927 eval(shift(@insns)); 928 &pxor ($t0,$t2); 929 eval(shift(@insns)); 930 eval(shift(@insns)); #@ 931 &movdqa ($t2,$t3); 932 eval(shift(@insns)); 933 eval(shift(@insns)); 934 &pxor ($t0,$t1); # sigma0(X[1..4]) 935 eval(shift(@insns)); #@ 936 eval(shift(@insns)); 937 eval(shift(@insns)); 938 &psrld ($t3,$sigma1[2]); 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 942 eval(shift(@insns)); #@ 943 eval(shift(@insns)); 944 &psrlq ($t2,$sigma1[0]); 945 eval(shift(@insns)); 946 eval(shift(@insns)); 947 eval(shift(@insns)); 948 &pxor ($t3,$t2); 949 eval(shift(@insns)); #@ 950 eval(shift(@insns)); 951 eval(shift(@insns)); 952 eval(shift(@insns)); #@ 953 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 954 eval(shift(@insns)); 955 eval(shift(@insns)); 956 &pxor ($t3,$t2); 957 eval(shift(@insns)); #@ 958 eval(shift(@insns)); 959 eval(shift(@insns)); 960 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 961 &pshufd ($t3,$t3,0b10000000); 962 
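	# The pshufd above and the psrldq a few lines below stand in for the
	# commented-out pshufb with the $t4 packing mask: together they move
	# dwords 0 and 2 of $t3 into the low 64 bits and clear the upper half,
	# which is where the following paddd expects sigma1(X[14..15]).  The
	# later pshufd 0b00001000 + pslldq pair plays the same role for the
	# $t5 mask, placing sigma1(X[16..17]) in the upper half.  This is why
	# the movdqa loads of $t4/$t5 at the top of this path stay commented
	# out.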
eval(shift(@insns)); 963 eval(shift(@insns)); 964 eval(shift(@insns)); 965 &psrldq ($t3,8); 966 eval(shift(@insns)); 967 eval(shift(@insns)); #@ 968 eval(shift(@insns)); 969 eval(shift(@insns)); 970 eval(shift(@insns)); #@ 971 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 972 eval(shift(@insns)); 973 eval(shift(@insns)); 974 eval(shift(@insns)); 975 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 976 eval(shift(@insns)); 977 eval(shift(@insns)); #@ 978 eval(shift(@insns)); 979 &movdqa ($t2,$t3); 980 eval(shift(@insns)); 981 eval(shift(@insns)); 982 &psrld ($t3,$sigma1[2]); 983 eval(shift(@insns)); 984 eval(shift(@insns)); #@ 985 &psrlq ($t2,$sigma1[0]); 986 eval(shift(@insns)); 987 eval(shift(@insns)); 988 &pxor ($t3,$t2); 989 eval(shift(@insns)); #@ 990 eval(shift(@insns)); 991 eval(shift(@insns)); 992 eval(shift(@insns)); #@ 993 eval(shift(@insns)); 994 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 995 eval(shift(@insns)); 996 eval(shift(@insns)); 997 eval(shift(@insns)); 998 &pxor ($t3,$t2); 999 eval(shift(@insns)); 1000 eval(shift(@insns)); 1001 eval(shift(@insns)); #@ 1002 #&pshufb ($t3,$t5); 1003 &pshufd ($t3,$t3,0b00001000); 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); 1006 &movdqa ($t2,16*2*$j."($Tbl)"); 1007 eval(shift(@insns)); #@ 1008 eval(shift(@insns)); 1009 &pslldq ($t3,8); 1010 eval(shift(@insns)); 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); 1013 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1014 eval(shift(@insns)); #@ 1015 eval(shift(@insns)); 1016 eval(shift(@insns)); 1017 } 1018 &paddd ($t2,@X[0]); 1019 foreach (@insns) { eval; } # remaining instructions 1020 &movdqa (16*$j."(%rsp)",$t2); 1021 } 1022 1023 for ($i=0,$j=0; $j<4; $j++) { 1024 &SSSE3_256_00_47($j,\&body_00_15,@X); 1025 push(@X,shift(@X)); # rotate(@X) 1026 } 1027 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1028 &jne (".Lssse3_00_47"); 1029 1030 for ($i=0; $i<16; ) { 1031 foreach(body_00_15()) { eval; } 1032 } 1033 $code.=<<___; 1034 mov $_ctx,$ctx 1035 mov $a1,$A 1036 1037 add $SZ*0($ctx),$A 1038 lea 16*$SZ($inp),$inp 1039 add $SZ*1($ctx),$B 1040 add $SZ*2($ctx),$C 1041 add $SZ*3($ctx),$D 1042 add $SZ*4($ctx),$E 1043 add $SZ*5($ctx),$F 1044 add $SZ*6($ctx),$G 1045 add $SZ*7($ctx),$H 1046 1047 cmp $_end,$inp 1048 1049 mov $A,$SZ*0($ctx) 1050 mov $B,$SZ*1($ctx) 1051 mov $C,$SZ*2($ctx) 1052 mov $D,$SZ*3($ctx) 1053 mov $E,$SZ*4($ctx) 1054 mov $F,$SZ*5($ctx) 1055 mov $G,$SZ*6($ctx) 1056 mov $H,$SZ*7($ctx) 1057 jb .Lloop_ssse3 1058 1059 mov $_rsp,%rsi 1060 ___ 1061 $code.=<<___ if ($win64); 1062 movaps 16*$SZ+32(%rsp),%xmm6 1063 movaps 16*$SZ+48(%rsp),%xmm7 1064 movaps 16*$SZ+64(%rsp),%xmm8 1065 movaps 16*$SZ+80(%rsp),%xmm9 1066 ___ 1067 $code.=<<___; 1068 mov (%rsi),%r15 1069 mov 8(%rsi),%r14 1070 mov 16(%rsi),%r13 1071 mov 24(%rsi),%r12 1072 mov 32(%rsi),%rbp 1073 mov 40(%rsi),%rbx 1074 lea 48(%rsi),%rsp 1075 .Lepilogue_ssse3: 1076 ret 1077 .size ${func}_ssse3,.-${func}_ssse3 1078 ___ 1079 } 1080 1081 if ($avx) {{ 1082 ###################################################################### 1083 # XOP code path 1084 # 1085 if ($SZ==8) { # SHA512 only 1086 $code.=<<___; 1087 .type ${func}_xop,\@function,3 1088 .align 64 1089 ${func}_xop: 1090 .Lxop_shortcut: 1091 push %rbx 1092 push %rbp 1093 push %r12 1094 push %r13 1095 push %r14 1096 push %r15 1097 mov %rsp,%r11 # copy %rsp 1098 shl \$4,%rdx # num*16 1099 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1100 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1101 and \$-64,%rsp # align stack frame 1102 mov $ctx,$_ctx # save ctx, 1st arg 1103 mov $inp,$_inp # save inp, 2nd 
arh 1104 mov %rdx,$_end # save end pointer, "3rd" arg 1105 mov %r11,$_rsp # save copy of %rsp 1106 ___ 1107 $code.=<<___ if ($win64); 1108 movaps %xmm6,16*$SZ+32(%rsp) 1109 movaps %xmm7,16*$SZ+48(%rsp) 1110 movaps %xmm8,16*$SZ+64(%rsp) 1111 movaps %xmm9,16*$SZ+80(%rsp) 1112 ___ 1113 $code.=<<___ if ($win64 && $SZ>4); 1114 movaps %xmm10,16*$SZ+96(%rsp) 1115 movaps %xmm11,16*$SZ+112(%rsp) 1116 ___ 1117 $code.=<<___; 1118 .Lprologue_xop: 1119 1120 vzeroupper 1121 mov $SZ*0($ctx),$A 1122 mov $SZ*1($ctx),$B 1123 mov $SZ*2($ctx),$C 1124 mov $SZ*3($ctx),$D 1125 mov $SZ*4($ctx),$E 1126 mov $SZ*5($ctx),$F 1127 mov $SZ*6($ctx),$G 1128 mov $SZ*7($ctx),$H 1129 jmp .Lloop_xop 1130 ___ 1131 if ($SZ==4) { # SHA256 1132 my @X = map("%xmm$_",(0..3)); 1133 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1134 1135 $code.=<<___; 1136 .align 16 1137 .Lloop_xop: 1138 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1139 vmovdqu 0x00($inp),@X[0] 1140 vmovdqu 0x10($inp),@X[1] 1141 vmovdqu 0x20($inp),@X[2] 1142 vmovdqu 0x30($inp),@X[3] 1143 vpshufb $t3,@X[0],@X[0] 1144 lea $TABLE(%rip),$Tbl 1145 vpshufb $t3,@X[1],@X[1] 1146 vpshufb $t3,@X[2],@X[2] 1147 vpaddd 0x00($Tbl),@X[0],$t0 1148 vpshufb $t3,@X[3],@X[3] 1149 vpaddd 0x20($Tbl),@X[1],$t1 1150 vpaddd 0x40($Tbl),@X[2],$t2 1151 vpaddd 0x60($Tbl),@X[3],$t3 1152 vmovdqa $t0,0x00(%rsp) 1153 mov $A,$a1 1154 vmovdqa $t1,0x10(%rsp) 1155 mov $B,$a3 1156 vmovdqa $t2,0x20(%rsp) 1157 xor $C,$a3 # magic 1158 vmovdqa $t3,0x30(%rsp) 1159 mov $E,$a0 1160 jmp .Lxop_00_47 1161 1162 .align 16 1163 .Lxop_00_47: 1164 sub \$`-16*2*$SZ`,$Tbl # size optimization 1165 ___ 1166 sub XOP_256_00_47 () { 1167 my $j = shift; 1168 my $body = shift; 1169 my @X = @_; 1170 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1171 1172 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1173 eval(shift(@insns)); 1174 eval(shift(@insns)); 1175 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1176 eval(shift(@insns)); 1177 eval(shift(@insns)); 1178 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1179 eval(shift(@insns)); 1180 eval(shift(@insns)); 1181 &vpsrld ($t0,$t0,$sigma0[2]); 1182 eval(shift(@insns)); 1183 eval(shift(@insns)); 1184 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1185 eval(shift(@insns)); 1186 eval(shift(@insns)); 1187 eval(shift(@insns)); 1188 eval(shift(@insns)); 1189 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1190 eval(shift(@insns)); 1191 eval(shift(@insns)); 1192 &vpxor ($t0,$t0,$t1); 1193 eval(shift(@insns)); 1194 eval(shift(@insns)); 1195 eval(shift(@insns)); 1196 eval(shift(@insns)); 1197 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1198 eval(shift(@insns)); 1199 eval(shift(@insns)); 1200 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1201 eval(shift(@insns)); 1202 eval(shift(@insns)); 1203 &vpsrld ($t2,@X[3],$sigma1[2]); 1204 eval(shift(@insns)); 1205 eval(shift(@insns)); 1206 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1207 eval(shift(@insns)); 1208 eval(shift(@insns)); 1209 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1210 eval(shift(@insns)); 1211 eval(shift(@insns)); 1212 &vpxor ($t3,$t3,$t2); 1213 eval(shift(@insns)); 1214 eval(shift(@insns)); 1215 eval(shift(@insns)); 1216 eval(shift(@insns)); 1217 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1218 eval(shift(@insns)); 1219 eval(shift(@insns)); 1220 eval(shift(@insns)); 1221 eval(shift(@insns)); 1222 &vpsrldq ($t3,$t3,8); 1223 eval(shift(@insns)); 1224 eval(shift(@insns)); 1225 eval(shift(@insns)); 1226 eval(shift(@insns)); 1227 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1228 eval(shift(@insns)); 1229 eval(shift(@insns)); 1230 
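	# On this XOP path the rotations inside sigma0/sigma1 are done
	# directly with vprotd (XOP's native rotate), so each sigma term
	# needs only two rotates, one shift and two xors instead of the
	# shift/shift/xor chains used by the SSSE3 and plain-AVX paths.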
eval(shift(@insns)); 1231 eval(shift(@insns)); 1232 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1233 eval(shift(@insns)); 1234 eval(shift(@insns)); 1235 &vpsrld ($t2,@X[0],$sigma1[2]); 1236 eval(shift(@insns)); 1237 eval(shift(@insns)); 1238 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1239 eval(shift(@insns)); 1240 eval(shift(@insns)); 1241 &vpxor ($t3,$t3,$t2); 1242 eval(shift(@insns)); 1243 eval(shift(@insns)); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1247 eval(shift(@insns)); 1248 eval(shift(@insns)); 1249 eval(shift(@insns)); 1250 eval(shift(@insns)); 1251 &vpslldq ($t3,$t3,8); # 22 instructions 1252 eval(shift(@insns)); 1253 eval(shift(@insns)); 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1257 eval(shift(@insns)); 1258 eval(shift(@insns)); 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1262 foreach (@insns) { eval; } # remaining instructions 1263 &vmovdqa (16*$j."(%rsp)",$t2); 1264 } 1265 1266 for ($i=0,$j=0; $j<4; $j++) { 1267 &XOP_256_00_47($j,\&body_00_15,@X); 1268 push(@X,shift(@X)); # rotate(@X) 1269 } 1270 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1271 &jne (".Lxop_00_47"); 1272 1273 for ($i=0; $i<16; ) { 1274 foreach(body_00_15()) { eval; } 1275 } 1276 1277 } else { # SHA512 1278 my @X = map("%xmm$_",(0..7)); 1279 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1280 1281 $code.=<<___; 1282 .align 16 1283 .Lloop_xop: 1284 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1285 vmovdqu 0x00($inp),@X[0] 1286 lea $TABLE+0x80(%rip),$Tbl # size optimization 1287 vmovdqu 0x10($inp),@X[1] 1288 vmovdqu 0x20($inp),@X[2] 1289 vpshufb $t3,@X[0],@X[0] 1290 vmovdqu 0x30($inp),@X[3] 1291 vpshufb $t3,@X[1],@X[1] 1292 vmovdqu 0x40($inp),@X[4] 1293 vpshufb $t3,@X[2],@X[2] 1294 vmovdqu 0x50($inp),@X[5] 1295 vpshufb $t3,@X[3],@X[3] 1296 vmovdqu 0x60($inp),@X[6] 1297 vpshufb $t3,@X[4],@X[4] 1298 vmovdqu 0x70($inp),@X[7] 1299 vpshufb $t3,@X[5],@X[5] 1300 vpaddq -0x80($Tbl),@X[0],$t0 1301 vpshufb $t3,@X[6],@X[6] 1302 vpaddq -0x60($Tbl),@X[1],$t1 1303 vpshufb $t3,@X[7],@X[7] 1304 vpaddq -0x40($Tbl),@X[2],$t2 1305 vpaddq -0x20($Tbl),@X[3],$t3 1306 vmovdqa $t0,0x00(%rsp) 1307 vpaddq 0x00($Tbl),@X[4],$t0 1308 vmovdqa $t1,0x10(%rsp) 1309 vpaddq 0x20($Tbl),@X[5],$t1 1310 vmovdqa $t2,0x20(%rsp) 1311 vpaddq 0x40($Tbl),@X[6],$t2 1312 vmovdqa $t3,0x30(%rsp) 1313 vpaddq 0x60($Tbl),@X[7],$t3 1314 vmovdqa $t0,0x40(%rsp) 1315 mov $A,$a1 1316 vmovdqa $t1,0x50(%rsp) 1317 mov $B,$a3 1318 vmovdqa $t2,0x60(%rsp) 1319 xor $C,$a3 # magic 1320 vmovdqa $t3,0x70(%rsp) 1321 mov $E,$a0 1322 jmp .Lxop_00_47 1323 1324 .align 16 1325 .Lxop_00_47: 1326 add \$`16*2*$SZ`,$Tbl 1327 ___ 1328 sub XOP_512_00_47 () { 1329 my $j = shift; 1330 my $body = shift; 1331 my @X = @_; 1332 my @insns = (&$body,&$body); # 52 instructions 1333 1334 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1335 eval(shift(@insns)); 1336 eval(shift(@insns)); 1337 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1338 eval(shift(@insns)); 1339 eval(shift(@insns)); 1340 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1341 eval(shift(@insns)); 1342 eval(shift(@insns)); 1343 &vpsrlq ($t0,$t0,$sigma0[2]); 1344 eval(shift(@insns)); 1345 eval(shift(@insns)); 1346 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1347 eval(shift(@insns)); 1348 eval(shift(@insns)); 1349 eval(shift(@insns)); 1350 eval(shift(@insns)); 1351 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1352 eval(shift(@insns)); 1353 eval(shift(@insns)); 1354 &vpxor ($t0,$t0,$t1); 1355 
eval(shift(@insns)); 1356 eval(shift(@insns)); 1357 eval(shift(@insns)); 1358 eval(shift(@insns)); 1359 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1360 eval(shift(@insns)); 1361 eval(shift(@insns)); 1362 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1363 eval(shift(@insns)); 1364 eval(shift(@insns)); 1365 &vpsrlq ($t2,@X[7],$sigma1[2]); 1366 eval(shift(@insns)); 1367 eval(shift(@insns)); 1368 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1369 eval(shift(@insns)); 1370 eval(shift(@insns)); 1371 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1372 eval(shift(@insns)); 1373 eval(shift(@insns)); 1374 &vpxor ($t3,$t3,$t2); 1375 eval(shift(@insns)); 1376 eval(shift(@insns)); 1377 eval(shift(@insns)); 1378 eval(shift(@insns)); 1379 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1380 eval(shift(@insns)); 1381 eval(shift(@insns)); 1382 eval(shift(@insns)); 1383 eval(shift(@insns)); 1384 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1385 eval(shift(@insns)); 1386 eval(shift(@insns)); 1387 eval(shift(@insns)); 1388 eval(shift(@insns)); 1389 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1390 foreach (@insns) { eval; } # remaining instructions 1391 &vmovdqa (16*$j."(%rsp)",$t2); 1392 } 1393 1394 for ($i=0,$j=0; $j<8; $j++) { 1395 &XOP_512_00_47($j,\&body_00_15,@X); 1396 push(@X,shift(@X)); # rotate(@X) 1397 } 1398 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1399 &jne (".Lxop_00_47"); 1400 1401 for ($i=0; $i<16; ) { 1402 foreach(body_00_15()) { eval; } 1403 } 1404 } 1405 $code.=<<___; 1406 mov $_ctx,$ctx 1407 mov $a1,$A 1408 1409 add $SZ*0($ctx),$A 1410 lea 16*$SZ($inp),$inp 1411 add $SZ*1($ctx),$B 1412 add $SZ*2($ctx),$C 1413 add $SZ*3($ctx),$D 1414 add $SZ*4($ctx),$E 1415 add $SZ*5($ctx),$F 1416 add $SZ*6($ctx),$G 1417 add $SZ*7($ctx),$H 1418 1419 cmp $_end,$inp 1420 1421 mov $A,$SZ*0($ctx) 1422 mov $B,$SZ*1($ctx) 1423 mov $C,$SZ*2($ctx) 1424 mov $D,$SZ*3($ctx) 1425 mov $E,$SZ*4($ctx) 1426 mov $F,$SZ*5($ctx) 1427 mov $G,$SZ*6($ctx) 1428 mov $H,$SZ*7($ctx) 1429 jb .Lloop_xop 1430 1431 mov $_rsp,%rsi 1432 vzeroupper 1433 ___ 1434 $code.=<<___ if ($win64); 1435 movaps 16*$SZ+32(%rsp),%xmm6 1436 movaps 16*$SZ+48(%rsp),%xmm7 1437 movaps 16*$SZ+64(%rsp),%xmm8 1438 movaps 16*$SZ+80(%rsp),%xmm9 1439 ___ 1440 $code.=<<___ if ($win64 && $SZ>4); 1441 movaps 16*$SZ+96(%rsp),%xmm10 1442 movaps 16*$SZ+112(%rsp),%xmm11 1443 ___ 1444 $code.=<<___; 1445 mov (%rsi),%r15 1446 mov 8(%rsi),%r14 1447 mov 16(%rsi),%r13 1448 mov 24(%rsi),%r12 1449 mov 32(%rsi),%rbp 1450 mov 40(%rsi),%rbx 1451 lea 48(%rsi),%rsp 1452 .Lepilogue_xop: 1453 ret 1454 .size ${func}_xop,.-${func}_xop 1455 ___ 1456 } 1457 ###################################################################### 1458 # AVX+shrd code path 1459 # 1460 local *ror = sub { &shrd(@_[0],@_) }; 1461 1462 $code.=<<___; 1463 .type ${func}_avx,\@function,3 1464 .align 64 1465 ${func}_avx: 1466 .Lavx_shortcut: 1467 push %rbx 1468 push %rbp 1469 push %r12 1470 push %r13 1471 push %r14 1472 push %r15 1473 mov %rsp,%r11 # copy %rsp 1474 shl \$4,%rdx # num*16 1475 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1476 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1477 and \$-64,%rsp # align stack frame 1478 mov $ctx,$_ctx # save ctx, 1st arg 1479 mov $inp,$_inp # save inp, 2nd arh 1480 mov %rdx,$_end # save end pointer, "3rd" arg 1481 mov %r11,$_rsp # save copy of %rsp 1482 ___ 1483 $code.=<<___ if ($win64); 1484 movaps %xmm6,16*$SZ+32(%rsp) 1485 movaps %xmm7,16*$SZ+48(%rsp) 1486 movaps %xmm8,16*$SZ+64(%rsp) 1487 movaps %xmm9,16*$SZ+80(%rsp) 1488 ___ 1489 $code.=<<___ if ($win64 && $SZ>4); 1490 movaps 
%xmm10,16*$SZ+96(%rsp) 1491 movaps %xmm11,16*$SZ+112(%rsp) 1492 ___ 1493 $code.=<<___; 1494 .Lprologue_avx: 1495 1496 vzeroupper 1497 mov $SZ*0($ctx),$A 1498 mov $SZ*1($ctx),$B 1499 mov $SZ*2($ctx),$C 1500 mov $SZ*3($ctx),$D 1501 mov $SZ*4($ctx),$E 1502 mov $SZ*5($ctx),$F 1503 mov $SZ*6($ctx),$G 1504 mov $SZ*7($ctx),$H 1505 ___ 1506 if ($SZ==4) { # SHA256 1507 my @X = map("%xmm$_",(0..3)); 1508 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1509 1510 $code.=<<___; 1511 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1512 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1513 jmp .Lloop_avx 1514 .align 16 1515 .Lloop_avx: 1516 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1517 vmovdqu 0x00($inp),@X[0] 1518 vmovdqu 0x10($inp),@X[1] 1519 vmovdqu 0x20($inp),@X[2] 1520 vmovdqu 0x30($inp),@X[3] 1521 vpshufb $t3,@X[0],@X[0] 1522 lea $TABLE(%rip),$Tbl 1523 vpshufb $t3,@X[1],@X[1] 1524 vpshufb $t3,@X[2],@X[2] 1525 vpaddd 0x00($Tbl),@X[0],$t0 1526 vpshufb $t3,@X[3],@X[3] 1527 vpaddd 0x20($Tbl),@X[1],$t1 1528 vpaddd 0x40($Tbl),@X[2],$t2 1529 vpaddd 0x60($Tbl),@X[3],$t3 1530 vmovdqa $t0,0x00(%rsp) 1531 mov $A,$a1 1532 vmovdqa $t1,0x10(%rsp) 1533 mov $B,$a3 1534 vmovdqa $t2,0x20(%rsp) 1535 xor $C,$a3 # magic 1536 vmovdqa $t3,0x30(%rsp) 1537 mov $E,$a0 1538 jmp .Lavx_00_47 1539 1540 .align 16 1541 .Lavx_00_47: 1542 sub \$`-16*2*$SZ`,$Tbl # size optimization 1543 ___ 1544 sub Xupdate_256_AVX () { 1545 ( 1546 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1547 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1548 '&vpsrld ($t2,$t0,$sigma0[0]);', 1549 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1550 '&vpsrld ($t3,$t0,$sigma0[2])', 1551 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1552 '&vpxor ($t0,$t3,$t2)', 1553 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1554 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1555 '&vpxor ($t0,$t0,$t1)', 1556 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1557 '&vpxor ($t0,$t0,$t2)', 1558 '&vpsrld ($t2,$t3,$sigma1[2]);', 1559 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1560 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1561 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1562 '&vpxor ($t2,$t2,$t3);', 1563 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1564 '&vpxor ($t2,$t2,$t3)', 1565 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1566 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1567 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1568 '&vpsrld ($t2,$t3,$sigma1[2])', 1569 '&vpsrlq ($t3,$t3,$sigma1[0])', 1570 '&vpxor ($t2,$t2,$t3);', 1571 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1572 '&vpxor ($t2,$t2,$t3)', 1573 '&vpshufb ($t2,$t2,$t5)', 1574 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1575 ); 1576 } 1577 1578 sub AVX_256_00_47 () { 1579 my $j = shift; 1580 my $body = shift; 1581 my @X = @_; 1582 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1583 1584 foreach (Xupdate_256_AVX()) { # 29 instructions 1585 eval; 1586 eval(shift(@insns)); 1587 eval(shift(@insns)); 1588 eval(shift(@insns)); 1589 } 1590 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1591 foreach (@insns) { eval; } # remaining instructions 1592 &vmovdqa (16*$j."(%rsp)",$t2); 1593 } 1594 1595 for ($i=0,$j=0; $j<4; $j++) { 1596 &AVX_256_00_47($j,\&body_00_15,@X); 1597 push(@X,shift(@X)); # rotate(@X) 1598 } 1599 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1600 &jne (".Lavx_00_47"); 1601 1602 for ($i=0; $i<16; ) { 1603 foreach(body_00_15()) { eval; } 1604 } 1605 1606 } else { # SHA512 1607 my @X = map("%xmm$_",(0..7)); 1608 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1609 1610 $code.=<<___; 1611 jmp .Lloop_avx 
1612 .align 16 1613 .Lloop_avx: 1614 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1615 vmovdqu 0x00($inp),@X[0] 1616 lea $TABLE+0x80(%rip),$Tbl # size optimization 1617 vmovdqu 0x10($inp),@X[1] 1618 vmovdqu 0x20($inp),@X[2] 1619 vpshufb $t3,@X[0],@X[0] 1620 vmovdqu 0x30($inp),@X[3] 1621 vpshufb $t3,@X[1],@X[1] 1622 vmovdqu 0x40($inp),@X[4] 1623 vpshufb $t3,@X[2],@X[2] 1624 vmovdqu 0x50($inp),@X[5] 1625 vpshufb $t3,@X[3],@X[3] 1626 vmovdqu 0x60($inp),@X[6] 1627 vpshufb $t3,@X[4],@X[4] 1628 vmovdqu 0x70($inp),@X[7] 1629 vpshufb $t3,@X[5],@X[5] 1630 vpaddq -0x80($Tbl),@X[0],$t0 1631 vpshufb $t3,@X[6],@X[6] 1632 vpaddq -0x60($Tbl),@X[1],$t1 1633 vpshufb $t3,@X[7],@X[7] 1634 vpaddq -0x40($Tbl),@X[2],$t2 1635 vpaddq -0x20($Tbl),@X[3],$t3 1636 vmovdqa $t0,0x00(%rsp) 1637 vpaddq 0x00($Tbl),@X[4],$t0 1638 vmovdqa $t1,0x10(%rsp) 1639 vpaddq 0x20($Tbl),@X[5],$t1 1640 vmovdqa $t2,0x20(%rsp) 1641 vpaddq 0x40($Tbl),@X[6],$t2 1642 vmovdqa $t3,0x30(%rsp) 1643 vpaddq 0x60($Tbl),@X[7],$t3 1644 vmovdqa $t0,0x40(%rsp) 1645 mov $A,$a1 1646 vmovdqa $t1,0x50(%rsp) 1647 mov $B,$a3 1648 vmovdqa $t2,0x60(%rsp) 1649 xor $C,$a3 # magic 1650 vmovdqa $t3,0x70(%rsp) 1651 mov $E,$a0 1652 jmp .Lavx_00_47 1653 1654 .align 16 1655 .Lavx_00_47: 1656 add \$`16*2*$SZ`,$Tbl 1657 ___ 1658 sub Xupdate_512_AVX () { 1659 ( 1660 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1661 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1662 '&vpsrlq ($t2,$t0,$sigma0[0])', 1663 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1664 '&vpsrlq ($t3,$t0,$sigma0[2])', 1665 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1666 '&vpxor ($t0,$t3,$t2)', 1667 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1668 '&vpxor ($t0,$t0,$t1)', 1669 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1670 '&vpxor ($t0,$t0,$t2)', 1671 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1672 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1673 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1674 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1675 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1676 '&vpxor ($t3,$t3,$t2)', 1677 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1678 '&vpxor ($t3,$t3,$t1)', 1679 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1680 '&vpxor ($t3,$t3,$t2)', 1681 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1682 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1683 ); 1684 } 1685 1686 sub AVX_512_00_47 () { 1687 my $j = shift; 1688 my $body = shift; 1689 my @X = @_; 1690 my @insns = (&$body,&$body); # 52 instructions 1691 1692 foreach (Xupdate_512_AVX()) { # 23 instructions 1693 eval; 1694 eval(shift(@insns)); 1695 eval(shift(@insns)); 1696 } 1697 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1698 foreach (@insns) { eval; } # remaining instructions 1699 &vmovdqa (16*$j."(%rsp)",$t2); 1700 } 1701 1702 for ($i=0,$j=0; $j<8; $j++) { 1703 &AVX_512_00_47($j,\&body_00_15,@X); 1704 push(@X,shift(@X)); # rotate(@X) 1705 } 1706 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1707 &jne (".Lavx_00_47"); 1708 1709 for ($i=0; $i<16; ) { 1710 foreach(body_00_15()) { eval; } 1711 } 1712 } 1713 $code.=<<___; 1714 mov $_ctx,$ctx 1715 mov $a1,$A 1716 1717 add $SZ*0($ctx),$A 1718 lea 16*$SZ($inp),$inp 1719 add $SZ*1($ctx),$B 1720 add $SZ*2($ctx),$C 1721 add $SZ*3($ctx),$D 1722 add $SZ*4($ctx),$E 1723 add $SZ*5($ctx),$F 1724 add $SZ*6($ctx),$G 1725 add $SZ*7($ctx),$H 1726 1727 cmp $_end,$inp 1728 1729 mov $A,$SZ*0($ctx) 1730 mov $B,$SZ*1($ctx) 1731 mov $C,$SZ*2($ctx) 1732 mov $D,$SZ*3($ctx) 1733 mov $E,$SZ*4($ctx) 1734 mov $F,$SZ*5($ctx) 1735 mov $G,$SZ*6($ctx) 1736 mov $H,$SZ*7($ctx) 1737 jb .Lloop_avx 1738 1739 mov 
$_rsp,%rsi 1740 vzeroupper 1741 ___ 1742 $code.=<<___ if ($win64); 1743 movaps 16*$SZ+32(%rsp),%xmm6 1744 movaps 16*$SZ+48(%rsp),%xmm7 1745 movaps 16*$SZ+64(%rsp),%xmm8 1746 movaps 16*$SZ+80(%rsp),%xmm9 1747 ___ 1748 $code.=<<___ if ($win64 && $SZ>4); 1749 movaps 16*$SZ+96(%rsp),%xmm10 1750 movaps 16*$SZ+112(%rsp),%xmm11 1751 ___ 1752 $code.=<<___; 1753 mov (%rsi),%r15 1754 mov 8(%rsi),%r14 1755 mov 16(%rsi),%r13 1756 mov 24(%rsi),%r12 1757 mov 32(%rsi),%rbp 1758 mov 40(%rsi),%rbx 1759 lea 48(%rsi),%rsp 1760 .Lepilogue_avx: 1761 ret 1762 .size ${func}_avx,.-${func}_avx 1763 ___ 1764 1765 if ($avx>1) {{ 1766 ###################################################################### 1767 # AVX2+BMI code path 1768 # 1769 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1770 my $PUSH8=8*2*$SZ; 1771 use integer; 1772 1773 sub bodyx_00_15 () { 1774 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1775 ( 1776 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1777 1778 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1779 '&and ($a4,$e)', # f&e 1780 '&rorx ($a0,$e,$Sigma1[2])', 1781 '&rorx ($a2,$e,$Sigma1[1])', 1782 1783 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1784 '&lea ($h,"($h,$a4)")', 1785 '&andn ($a4,$e,$g)', # ~e&g 1786 '&xor ($a0,$a2)', 1787 1788 '&rorx ($a1,$e,$Sigma1[0])', 1789 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1790 '&xor ($a0,$a1)', # Sigma1(e) 1791 '&mov ($a2,$a)', 1792 1793 '&rorx ($a4,$a,$Sigma0[2])', 1794 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1795 '&xor ($a2,$b)', # a^b, b^c in next round 1796 '&rorx ($a1,$a,$Sigma0[1])', 1797 1798 '&rorx ($a0,$a,$Sigma0[0])', 1799 '&lea ($d,"($d,$h)")', # d+=h 1800 '&and ($a3,$a2)', # (b^c)&(a^b) 1801 '&xor ($a1,$a4)', 1802 1803 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1804 '&xor ($a1,$a0)', # Sigma0(a) 1805 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1806 '&mov ($a4,$e)', # copy of f in future 1807 1808 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1809 ); 1810 # and at the finish one has to $a+=$a1 1811 } 1812 1813 $code.=<<___; 1814 .type ${func}_avx2,\@function,3 1815 .align 64 1816 ${func}_avx2: 1817 .Lavx2_shortcut: 1818 push %rbx 1819 push %rbp 1820 push %r12 1821 push %r13 1822 push %r14 1823 push %r15 1824 mov %rsp,%r11 # copy %rsp 1825 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1826 shl \$4,%rdx # num*16 1827 and \$-256*$SZ,%rsp # align stack frame 1828 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1829 add \$`2*$SZ*($rounds-8)`,%rsp 1830 mov $ctx,$_ctx # save ctx, 1st arg 1831 mov $inp,$_inp # save inp, 2nd arh 1832 mov %rdx,$_end # save end pointer, "3rd" arg 1833 mov %r11,$_rsp # save copy of %rsp 1834 ___ 1835 $code.=<<___ if ($win64); 1836 movaps %xmm6,16*$SZ+32(%rsp) 1837 movaps %xmm7,16*$SZ+48(%rsp) 1838 movaps %xmm8,16*$SZ+64(%rsp) 1839 movaps %xmm9,16*$SZ+80(%rsp) 1840 ___ 1841 $code.=<<___ if ($win64 && $SZ>4); 1842 movaps %xmm10,16*$SZ+96(%rsp) 1843 movaps %xmm11,16*$SZ+112(%rsp) 1844 ___ 1845 $code.=<<___; 1846 .Lprologue_avx2: 1847 1848 vzeroupper 1849 sub \$-16*$SZ,$inp # inp++, size optimization 1850 mov $SZ*0($ctx),$A 1851 mov $inp,%r12 # borrow $T1 1852 mov $SZ*1($ctx),$B 1853 cmp %rdx,$inp # $_end 1854 mov $SZ*2($ctx),$C 1855 cmove %rsp,%r12 # next block or random data 1856 mov $SZ*3($ctx),$D 1857 mov $SZ*4($ctx),$E 1858 mov $SZ*5($ctx),$F 1859 mov $SZ*6($ctx),$G 1860 mov $SZ*7($ctx),$H 1861 ___ 1862 if ($SZ==4) { # SHA256 1863 my @X = map("%ymm$_",(0..3)); 1864 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1865 1866 $code.=<<___; 1867 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1868 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1869 jmp .Loop_avx2 1870 .align 16 1871 .Loop_avx2: 1872 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1873 vmovdqu -16*$SZ+0($inp),%xmm0 1874 vmovdqu -16*$SZ+16($inp),%xmm1 1875 vmovdqu -16*$SZ+32($inp),%xmm2 1876 vmovdqu -16*$SZ+48($inp),%xmm3 1877 #mov $inp,$_inp # offload $inp 1878 vinserti128 \$1,(%r12),@X[0],@X[0] 1879 vinserti128 \$1,16(%r12),@X[1],@X[1] 1880 vpshufb $t3,@X[0],@X[0] 1881 vinserti128 \$1,32(%r12),@X[2],@X[2] 1882 vpshufb $t3,@X[1],@X[1] 1883 vinserti128 \$1,48(%r12),@X[3],@X[3] 1884 1885 lea $TABLE(%rip),$Tbl 1886 vpshufb $t3,@X[2],@X[2] 1887 vpaddd 0x00($Tbl),@X[0],$t0 1888 vpshufb $t3,@X[3],@X[3] 1889 vpaddd 0x20($Tbl),@X[1],$t1 1890 vpaddd 0x40($Tbl),@X[2],$t2 1891 vpaddd 0x60($Tbl),@X[3],$t3 1892 vmovdqa $t0,0x00(%rsp) 1893 xor $a1,$a1 1894 vmovdqa $t1,0x20(%rsp) 1895 lea -$PUSH8(%rsp),%rsp 1896 mov $B,$a3 1897 vmovdqa $t2,0x00(%rsp) 1898 xor $C,$a3 # magic 1899 vmovdqa $t3,0x20(%rsp) 1900 mov $F,$a4 1901 sub \$-16*2*$SZ,$Tbl # size optimization 1902 jmp .Lavx2_00_47 1903 1904 .align 16 1905 .Lavx2_00_47: 1906 ___ 1907 1908 sub AVX2_256_00_47 () { 1909 my $j = shift; 1910 my $body = shift; 1911 my @X = @_; 1912 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1913 my $base = "+2*$PUSH8(%rsp)"; 1914 1915 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1916 foreach (Xupdate_256_AVX()) { # 29 instructions 1917 eval; 1918 eval(shift(@insns)); 1919 eval(shift(@insns)); 1920 eval(shift(@insns)); 1921 } 1922 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1923 foreach (@insns) { eval; } # remaining instructions 1924 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1925 } 1926 1927 for ($i=0,$j=0; $j<4; $j++) { 1928 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1929 push(@X,shift(@X)); # rotate(@X) 1930 } 1931 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1932 
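	# The cmpb below relies on the K table layout: every K256/K512 entry
	# has a non-zero byte at offset $SZ-1 (its most significant byte),
	# while the shuffle constants that follow the table start with
	# 0x00010203.../0x0001020304050607, so a zero there tells the loop
	# that $Tbl has walked past the last round constant.  The scalar,
	# SSSE3, XOP and AVX paths use the same trick.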
&cmpb (($SZ-1)."($Tbl)",0); 1933 &jne (".Lavx2_00_47"); 1934 1935 for ($i=0; $i<16; ) { 1936 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1937 foreach(bodyx_00_15()) { eval; } 1938 } 1939 } else { # SHA512 1940 my @X = map("%ymm$_",(0..7)); 1941 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1942 1943 $code.=<<___; 1944 jmp .Loop_avx2 1945 .align 16 1946 .Loop_avx2: 1947 vmovdqu -16*$SZ($inp),%xmm0 1948 vmovdqu -16*$SZ+16($inp),%xmm1 1949 vmovdqu -16*$SZ+32($inp),%xmm2 1950 lea $TABLE+0x80(%rip),$Tbl # size optimization 1951 vmovdqu -16*$SZ+48($inp),%xmm3 1952 vmovdqu -16*$SZ+64($inp),%xmm4 1953 vmovdqu -16*$SZ+80($inp),%xmm5 1954 vmovdqu -16*$SZ+96($inp),%xmm6 1955 vmovdqu -16*$SZ+112($inp),%xmm7 1956 #mov $inp,$_inp # offload $inp 1957 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1958 vinserti128 \$1,(%r12),@X[0],@X[0] 1959 vinserti128 \$1,16(%r12),@X[1],@X[1] 1960 vpshufb $t2,@X[0],@X[0] 1961 vinserti128 \$1,32(%r12),@X[2],@X[2] 1962 vpshufb $t2,@X[1],@X[1] 1963 vinserti128 \$1,48(%r12),@X[3],@X[3] 1964 vpshufb $t2,@X[2],@X[2] 1965 vinserti128 \$1,64(%r12),@X[4],@X[4] 1966 vpshufb $t2,@X[3],@X[3] 1967 vinserti128 \$1,80(%r12),@X[5],@X[5] 1968 vpshufb $t2,@X[4],@X[4] 1969 vinserti128 \$1,96(%r12),@X[6],@X[6] 1970 vpshufb $t2,@X[5],@X[5] 1971 vinserti128 \$1,112(%r12),@X[7],@X[7] 1972 1973 vpaddq -0x80($Tbl),@X[0],$t0 1974 vpshufb $t2,@X[6],@X[6] 1975 vpaddq -0x60($Tbl),@X[1],$t1 1976 vpshufb $t2,@X[7],@X[7] 1977 vpaddq -0x40($Tbl),@X[2],$t2 1978 vpaddq -0x20($Tbl),@X[3],$t3 1979 vmovdqa $t0,0x00(%rsp) 1980 vpaddq 0x00($Tbl),@X[4],$t0 1981 vmovdqa $t1,0x20(%rsp) 1982 vpaddq 0x20($Tbl),@X[5],$t1 1983 vmovdqa $t2,0x40(%rsp) 1984 vpaddq 0x40($Tbl),@X[6],$t2 1985 vmovdqa $t3,0x60(%rsp) 1986 lea -$PUSH8(%rsp),%rsp 1987 vpaddq 0x60($Tbl),@X[7],$t3 1988 vmovdqa $t0,0x00(%rsp) 1989 xor $a1,$a1 1990 vmovdqa $t1,0x20(%rsp) 1991 mov $B,$a3 1992 vmovdqa $t2,0x40(%rsp) 1993 xor $C,$a3 # magic 1994 vmovdqa $t3,0x60(%rsp) 1995 mov $F,$a4 1996 add \$16*2*$SZ,$Tbl 1997 jmp .Lavx2_00_47 1998 1999 .align 16 2000 .Lavx2_00_47: 2001 ___ 2002 2003 sub AVX2_512_00_47 () { 2004 my $j = shift; 2005 my $body = shift; 2006 my @X = @_; 2007 my @insns = (&$body,&$body); # 48 instructions 2008 my $base = "+2*$PUSH8(%rsp)"; 2009 2010 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 2011 foreach (Xupdate_512_AVX()) { # 23 instructions 2012 eval; 2013 if ($_ !~ /\;$/) { 2014 eval(shift(@insns)); 2015 eval(shift(@insns)); 2016 eval(shift(@insns)); 2017 } 2018 } 2019 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 2020 foreach (@insns) { eval; } # remaining instructions 2021 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2022 } 2023 2024 for ($i=0,$j=0; $j<8; $j++) { 2025 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 2026 push(@X,shift(@X)); # rotate(@X) 2027 } 2028 &lea ($Tbl,16*2*$SZ."($Tbl)"); 2029 &cmpb (($SZ-1-0x80)."($Tbl)",0); 2030 &jne (".Lavx2_00_47"); 2031 2032 for ($i=0; $i<16; ) { 2033 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2034 foreach(bodyx_00_15()) { eval; } 2035 } 2036 } 2037 $code.=<<___; 2038 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2039 add $a1,$A 2040 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2041 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 2042 2043 add $SZ*0($ctx),$A 2044 add $SZ*1($ctx),$B 2045 add $SZ*2($ctx),$C 2046 add $SZ*3($ctx),$D 2047 add $SZ*4($ctx),$E 2048 add $SZ*5($ctx),$F 2049 add $SZ*6($ctx),$G 2050 add $SZ*7($ctx),$H 2051 2052 mov $A,$SZ*0($ctx) 2053 mov $B,$SZ*1($ctx) 2054 mov $C,$SZ*2($ctx) 2055 mov $D,$SZ*3($ctx) 2056 mov $E,$SZ*4($ctx) 2057 mov $F,$SZ*5($ctx) 2058 mov $G,$SZ*6($ctx) 2059 mov $H,$SZ*7($ctx) 2060 2061 cmp 
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue		# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
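# Worked example of the sha256op38() encoder above (illustrative comment only,
# not part of the generated output): SHA256RNDS2 has no mandatory prefix and is
# encoded as 0f 38 cb /r, so for an input line such as
#
#	sha256rnds2	%xmm0,%xmm1
#
# the ModR/M byte is 0xc0|(0&7)|((1&7)<<3) = 0xc8, and the substitution loop
# rewrites the instruction as
#
#	.byte	15,56,203,200
#
# i.e. the opcode bytes 0f 38 cb c8 in decimal, presumably so the generated
# file assembles even with toolchains that predate the SHA Extensions
# mnemonics.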