1 #!/usr/bin/env perl 2 # 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 # 10 # SHA256 block transform for x86. September 2007. 11 # 12 # Performance improvement over compiler generated code varies from 13 # 10% to 40% [see below]. Not very impressive on some -archs, but 14 # it's 5 times smaller and optimizies amount of writes. 15 # 16 # May 2012. 17 # 18 # Optimization including two of Pavel Semjanov's ideas, alternative 19 # Maj and full unroll, resulted in ~20-25% improvement on most CPUs, 20 # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost 21 # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not 22 # on P4, where it kills performance, nor Sandy Bridge, where folded 23 # loop is approximately as fast... 24 # 25 # June 2012. 26 # 27 # Add AMD XOP-specific code path, >30% improvement on Bulldozer over 28 # May version, >60% over original. Add AVX+shrd code path, >25% 29 # improvement on Sandy Bridge over May version, 60% over original. 30 # 31 # May 2013. 32 # 33 # Replace AMD XOP code path with SSSE3 to cover more processors. 34 # (Biggest improvement coefficient is on upcoming Atom Silvermont, 35 # not shown.) Add AVX+BMI code path. 36 # 37 # March 2014. 38 # 39 # Add support for Intel SHA Extensions. 40 # 41 # Performance in clock cycles per processed byte (less is better): 42 # 43 # gcc icc x86 asm(*) SIMD x86_64 asm(**) 44 # Pentium 46 57 40/38 - - 45 # PIII 36 33 27/24 - - 46 # P4 41 38 28 - 17.3 47 # AMD K8 27 25 19/15.5 - 14.9 48 # Core2 26 23 18/15.6 14.3 13.8 49 # Westmere 27 - 19/15.7 13.4 12.3 50 # Sandy Bridge 25 - 15.9 12.4 11.6 51 # Ivy Bridge 24 - 15.0 11.4 10.3 52 # Haswell 22 - 13.9 9.46 7.80 53 # Bulldozer 36 - 27/22 17.0 13.6 54 # VIA Nano 36 - 25/22 16.8 16.5 55 # Atom 50 - 30/25 21.9 18.9 56 # Silvermont 40 - 34/31 22.9 20.6 57 # 58 # (*) numbers after slash are for unrolled loop, where applicable; 59 # (**) x86_64 assembly performance is presented for reference 60 # purposes, results are best-available; 61 62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 63 push(@INC,"${dir}","${dir}../../perlasm"); 64 require "x86asm.pl"; 65 66 &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); 67 68 $xmm=$avx=0; 69 for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } 70 71 # In upstream, this is controlled by shelling out to the compiler to check 72 # versions, but BoringSSL is intended to be used with pre-generated perlasm 73 # output, so this isn't useful anyway. 74 # 75 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. 76 $avx = 1; 77 78 $avx = 0 unless ($xmm); 79 80 $shaext=$xmm; ### set to zero if compiling for 1.0.1 81 82 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's 83 # been tested. 84 $shaext = 0; 85 86 $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of 87 # fully unrolled loop was measured to run about 88 # 3-4x slower. If slowdown coefficient is N and 89 # unrolled loop is m times faster, then you break 90 # even at (N-1)/(m-1) blocks. Then it needs to be 91 # adjusted for probability of code being evicted, 92 # code size/cache size=1/4. Typical m is 1.15... 93 94 $A="eax"; 95 $E="edx"; 96 $T="ebx"; 97 $Aoff=&DWP(4,"esp"); 98 $Boff=&DWP(8,"esp"); 99 $Coff=&DWP(12,"esp"); 100 $Doff=&DWP(16,"esp"); 101 $Eoff=&DWP(20,"esp"); 102 $Foff=&DWP(24,"esp"); 103 $Goff=&DWP(28,"esp"); 104 $Hoff=&DWP(32,"esp"); 105 $Xoff=&DWP(36,"esp"); 106 $K256="ebp"; 107 108 sub BODY_16_63() { 109 &mov ($T,"ecx"); # "ecx" is preloaded 110 &mov ("esi",&DWP(4*(9+15+16-14),"esp")); 111 &ror ("ecx",18-7); 112 &mov ("edi","esi"); 113 &ror ("esi",19-17); 114 &xor ("ecx",$T); 115 &shr ($T,3); 116 &ror ("ecx",7); 117 &xor ("esi","edi"); 118 &xor ($T,"ecx"); # T = sigma0(X[-15]) 119 &ror ("esi",17); 120 &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] 121 &shr ("edi",10); 122 &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] 123 #&xor ("edi","esi") # sigma1(X[-2]) 124 # &add ($T,"edi"); # T += sigma1(X[-2]) 125 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 126 127 &BODY_00_15(1); 128 } 129 sub BODY_00_15() { 130 my $in_16_63=shift; 131 132 &mov ("ecx",$E); 133 &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) 134 &mov ("esi",$Foff); 135 &ror ("ecx",25-11); 136 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) 137 &mov ("edi",$Goff); 138 &xor ("ecx",$E); 139 &xor ("esi","edi"); 140 &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); 141 &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] 142 &ror ("ecx",11-6); 143 &and ("esi",$E); 144 &mov ($Eoff,$E); # modulo-scheduled 145 &xor ($E,"ecx"); 146 &add ($T,$Hoff); # T += h 147 &xor ("esi","edi"); # Ch(e,f,g) 148 &ror ($E,6); # Sigma1(e) 149 &mov ("ecx",$A); 150 &add ($T,"esi"); # T += Ch(e,f,g) 151 152 &ror ("ecx",22-13); 153 &add ($T,$E); # T += Sigma1(e) 154 &mov ("edi",$Boff); 155 &xor ("ecx",$A); 156 &mov ($Aoff,$A); # modulo-scheduled 157 &lea ("esp",&DWP(-4,"esp")); 158 &ror ("ecx",13-2); 159 &mov ("esi",&DWP(0,$K256)); 160 &xor ("ecx",$A); 161 &mov ($E,$Eoff); # e in next iteration, d in this one 162 &xor ($A,"edi"); # a ^= b 163 &ror ("ecx",2); # Sigma0(a) 164 165 &add ($T,"esi"); # T+= K[i] 166 &mov (&DWP(0,"esp"),$A); # (b^c) in next round 167 &add ($E,$T); # d += T 168 &and ($A,&DWP(4,"esp")); # a &= (b^c) 169 &add ($T,"ecx"); # T += Sigma0(a) 170 &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 171 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T 172 &add ($K256,4); 173 &add ($A,$T); # h += T 174 } 175 176 &external_label("OPENSSL_ia32cap_P") if (!$i386); 177 178 &function_begin("sha256_block_data_order"); 179 &mov ("esi",wparam(0)); # ctx 180 &mov ("edi",wparam(1)); # inp 181 &mov ("eax",wparam(2)); # num 182 &mov ("ebx","esp"); # saved sp 183 184 &call (&label("pic_point")); # make it PIC! 185 &set_label("pic_point"); 186 &blindpop($K256); 187 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); 188 189 &sub ("esp",16); 190 &and ("esp",-64); 191 192 &shl ("eax",6); 193 &add ("eax","edi"); 194 &mov (&DWP(0,"esp"),"esi"); # ctx 195 &mov (&DWP(4,"esp"),"edi"); # inp 196 &mov (&DWP(8,"esp"),"eax"); # inp+num*128 197 &mov (&DWP(12,"esp"),"ebx"); # saved sp 198 if (!$i386 && $xmm) { 199 &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); 200 &mov ("ecx",&DWP(0,"edx")); 201 &mov ("ebx",&DWP(4,"edx")); 202 &test ("ecx",1<<20); # check for P4 203 &jnz (&label("loop")); 204 &mov ("edx",&DWP(8,"edx")) if ($xmm); 205 &test ("ecx",1<<24); # check for FXSR 206 &jz ($unroll_after?&label("no_xmm"):&label("loop")); 207 &and ("ecx",1<<30); # mask "Intel CPU" bit 208 &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits 209 &test ("edx",1<<29) if ($shaext); # check for SHA 210 &jnz (&label("shaext")) if ($shaext); 211 &or ("ecx","ebx"); 212 &and ("ecx",1<<28|1<<30); 213 &cmp ("ecx",1<<28|1<<30); 214 if ($xmm) { 215 &je (&label("AVX")) if ($avx); 216 &test ("ebx",1<<9); # check for SSSE3 217 &jnz (&label("SSSE3")); 218 } else { 219 &je (&label("loop_shrd")); 220 } 221 if ($unroll_after) { 222 &set_label("no_xmm"); 223 &sub ("eax","edi"); 224 &cmp ("eax",$unroll_after); 225 &jae (&label("unrolled")); 226 } } 227 &jmp (&label("loop")); 228 229 sub COMPACT_LOOP() { 230 my $suffix=shift; 231 232 &set_label("loop$suffix",$suffix?32:16); 233 # copy input block to stack reversing byte and dword order 234 for($i=0;$i<4;$i++) { 235 &mov ("eax",&DWP($i*16+0,"edi")); 236 &mov ("ebx",&DWP($i*16+4,"edi")); 237 &mov ("ecx",&DWP($i*16+8,"edi")); 238 &bswap ("eax"); 239 &mov ("edx",&DWP($i*16+12,"edi")); 240 &bswap ("ebx"); 241 &push ("eax"); 242 &bswap ("ecx"); 243 &push ("ebx"); 244 &bswap ("edx"); 245 &push ("ecx"); 246 &push ("edx"); 247 } 248 &add ("edi",64); 249 &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H 250 &mov (&DWP(4*(9+16)+4,"esp"),"edi"); 251 252 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 253 &mov ($A,&DWP(0,"esi")); 254 &mov ("ebx",&DWP(4,"esi")); 255 &mov ("ecx",&DWP(8,"esi")); 256 &mov ("edi",&DWP(12,"esi")); 257 # &mov ($Aoff,$A); 258 &mov ($Boff,"ebx"); 259 &xor ("ebx","ecx"); 260 &mov ($Coff,"ecx"); 261 &mov ($Doff,"edi"); 262 &mov (&DWP(0,"esp"),"ebx"); # magic 263 &mov ($E,&DWP(16,"esi")); 264 &mov ("ebx",&DWP(20,"esi")); 265 &mov ("ecx",&DWP(24,"esi")); 266 &mov ("edi",&DWP(28,"esi")); 267 # &mov ($Eoff,$E); 268 &mov ($Foff,"ebx"); 269 &mov ($Goff,"ecx"); 270 &mov ($Hoff,"edi"); 271 272 &set_label("00_15$suffix",16); 273 274 &BODY_00_15(); 275 276 &cmp ("esi",0xc19bf174); 277 &jne (&label("00_15$suffix")); 278 279 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) 280 &jmp (&label("16_63$suffix")); 281 282 &set_label("16_63$suffix",16); 283 284 &BODY_16_63(); 285 286 &cmp ("esi",0xc67178f2); 287 &jne (&label("16_63$suffix")); 288 289 &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx 290 # &mov ($A,$Aoff); 291 &mov ("ebx",$Boff); 292 # &mov ("edi",$Coff); 293 &mov ("ecx",$Doff); 294 &add ($A,&DWP(0,"esi")); 295 &add ("ebx",&DWP(4,"esi")); 296 &add ("edi",&DWP(8,"esi")); 297 &add ("ecx",&DWP(12,"esi")); 298 &mov (&DWP(0,"esi"),$A); 299 &mov (&DWP(4,"esi"),"ebx"); 300 &mov (&DWP(8,"esi"),"edi"); 301 &mov (&DWP(12,"esi"),"ecx"); 302 # &mov ($E,$Eoff); 303 &mov ("eax",$Foff); 304 &mov ("ebx",$Goff); 305 &mov ("ecx",$Hoff); 306 &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp 307 &add ($E,&DWP(16,"esi")); 308 &add ("eax",&DWP(20,"esi")); 309 &add ("ebx",&DWP(24,"esi")); 310 &add ("ecx",&DWP(28,"esi")); 311 &mov (&DWP(16,"esi"),$E); 312 &mov (&DWP(20,"esi"),"eax"); 313 &mov (&DWP(24,"esi"),"ebx"); 314 &mov (&DWP(28,"esi"),"ecx"); 315 316 &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame 317 &sub ($K256,4*64); # rewind K 318 319 &cmp ("edi",&DWP(8,"esp")); # are we done yet? 320 &jb (&label("loop$suffix")); 321 } 322 &COMPACT_LOOP(); 323 &mov ("esp",&DWP(12,"esp")); # restore sp 324 &function_end_A(); 325 if (!$i386 && !$xmm) { 326 # ~20% improvement on Sandy Bridge 327 local *ror = sub { &shrd(@_[0],@_) }; 328 &COMPACT_LOOP("_shrd"); 329 &mov ("esp",&DWP(12,"esp")); # restore sp 330 &function_end_A(); 331 } 332 333 &set_label("K256",64); # Yes! I keep it in the code segment! 334 @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 335 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 336 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 337 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 338 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 339 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 340 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 341 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 342 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 343 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 344 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 345 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 346 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 347 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 348 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 349 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 350 &data_word(@K256); 351 &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask 352 &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); 353 354 ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets 355 sub off { &DWP(4*(((shift)-$i)&7),"esp"); } 356 357 if (!$i386 && $unroll_after) { 358 my @AH=($A,$K256); 359 360 &set_label("unrolled",16); 361 &lea ("esp",&DWP(-96,"esp")); 362 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 363 &mov ($AH[0],&DWP(0,"esi")); 364 &mov ($AH[1],&DWP(4,"esi")); 365 &mov ("ecx",&DWP(8,"esi")); 366 &mov ("ebx",&DWP(12,"esi")); 367 #&mov (&DWP(0,"esp"),$AH[0]); 368 &mov (&DWP(4,"esp"),$AH[1]); 369 &xor ($AH[1],"ecx"); # magic 370 &mov (&DWP(8,"esp"),"ecx"); 371 &mov (&DWP(12,"esp"),"ebx"); 372 &mov ($E,&DWP(16,"esi")); 373 &mov ("ebx",&DWP(20,"esi")); 374 &mov ("ecx",&DWP(24,"esi")); 375 &mov ("esi",&DWP(28,"esi")); 376 #&mov (&DWP(16,"esp"),$E); 377 &mov (&DWP(20,"esp"),"ebx"); 378 &mov (&DWP(24,"esp"),"ecx"); 379 &mov (&DWP(28,"esp"),"esi"); 380 &jmp (&label("grand_loop")); 381 382 &set_label("grand_loop",16); 383 # copy input block to stack reversing byte order 384 for($i=0;$i<5;$i++) { 385 &mov ("ebx",&DWP(12*$i+0,"edi")); 386 &mov ("ecx",&DWP(12*$i+4,"edi")); 387 &bswap ("ebx"); 388 &mov ("esi",&DWP(12*$i+8,"edi")); 389 &bswap ("ecx"); 390 &mov (&DWP(32+12*$i+0,"esp"),"ebx"); 391 &bswap ("esi"); 392 &mov (&DWP(32+12*$i+4,"esp"),"ecx"); 393 &mov (&DWP(32+12*$i+8,"esp"),"esi"); 394 } 395 &mov ("ebx",&DWP($i*12,"edi")); 396 &add ("edi",64); 397 &bswap ("ebx"); 398 &mov (&DWP(96+4,"esp"),"edi"); 399 &mov (&DWP(32+12*$i,"esp"),"ebx"); 400 401 my ($t1,$t2) = ("ecx","esi"); 402 403 for ($i=0;$i<64;$i++) { 404 405 if ($i>=16) { 406 &mov ($T,$t1); # $t1 is preloaded 407 # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); 408 &ror ($t1,18-7); 409 &mov ("edi",$t2); 410 &ror ($t2,19-17); 411 &xor ($t1,$T); 412 &shr ($T,3); 413 &ror ($t1,7); 414 &xor ($t2,"edi"); 415 &xor ($T,$t1); # T = sigma0(X[-15]) 416 &ror ($t2,17); 417 &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] 418 &shr ("edi",10); 419 &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] 420 #&xor ("edi",$t2) # sigma1(X[-2]) 421 # &add ($T,"edi"); # T += sigma1(X[-2]) 422 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 423 } 424 &mov ($t1,$E); 425 &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) 426 &mov ($t2,&off($f)); 427 &ror ($E,25-11); 428 &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) 429 &mov ("edi",&off($g)); 430 &xor ($E,$t1); 431 &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] 432 &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] 433 &xor ($t2,"edi"); 434 &ror ($E,11-6); 435 &and ($t2,$t1); 436 &mov (&off($e),$t1); # save $E, modulo-scheduled 437 &xor ($E,$t1); 438 &add ($T,&off($h)); # T += h 439 &xor ("edi",$t2); # Ch(e,f,g) 440 &ror ($E,6); # Sigma1(e) 441 &mov ($t1,$AH[0]); 442 &add ($T,"edi"); # T += Ch(e,f,g) 443 444 &ror ($t1,22-13); 445 &mov ($t2,$AH[0]); 446 &mov ("edi",&off($b)); 447 &xor ($t1,$AH[0]); 448 &mov (&off($a),$AH[0]); # save $A, modulo-scheduled 449 &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round 450 &ror ($t1,13-2); 451 &and ($AH[1],$AH[0]); # (b^c) &= (a^b) 452 &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] 453 &xor ($t1,$t2); 454 &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 455 &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); 456 &ror ($t1,2); # Sigma0(a) 457 458 &add ($AH[1],$E); # h += T 459 &add ($E,&off($d)); # d += T 460 &add ($AH[1],$t1); # h += Sigma0(a) 461 &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); 462 463 @AH = reverse(@AH); # rotate(a,h) 464 ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) 465 } 466 &mov ("esi",&DWP(96,"esp")); #ctx 467 #&mov ($AH[0],&DWP(0,"esp")); 468 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 469 #&mov ("edi", &DWP(8,"esp")); 470 &mov ("ecx",&DWP(12,"esp")); 471 &add ($AH[0],&DWP(0,"esi")); 472 &add ($AH[1],&DWP(4,"esi")); 473 &add ("edi",&DWP(8,"esi")); 474 &add ("ecx",&DWP(12,"esi")); 475 &mov (&DWP(0,"esi"),$AH[0]); 476 &mov (&DWP(4,"esi"),$AH[1]); 477 &mov (&DWP(8,"esi"),"edi"); 478 &mov (&DWP(12,"esi"),"ecx"); 479 #&mov (&DWP(0,"esp"),$AH[0]); 480 &mov (&DWP(4,"esp"),$AH[1]); 481 &xor ($AH[1],"edi"); # magic 482 &mov (&DWP(8,"esp"),"edi"); 483 &mov (&DWP(12,"esp"),"ecx"); 484 #&mov ($E,&DWP(16,"esp")); 485 &mov ("edi",&DWP(20,"esp")); 486 &mov ("ebx",&DWP(24,"esp")); 487 &mov ("ecx",&DWP(28,"esp")); 488 &add ($E,&DWP(16,"esi")); 489 &add ("edi",&DWP(20,"esi")); 490 &add ("ebx",&DWP(24,"esi")); 491 &add ("ecx",&DWP(28,"esi")); 492 &mov (&DWP(16,"esi"),$E); 493 &mov (&DWP(20,"esi"),"edi"); 494 &mov (&DWP(24,"esi"),"ebx"); 495 &mov (&DWP(28,"esi"),"ecx"); 496 #&mov (&DWP(16,"esp"),$E); 497 &mov (&DWP(20,"esp"),"edi"); 498 &mov ("edi",&DWP(96+4,"esp")); # inp 499 &mov (&DWP(24,"esp"),"ebx"); 500 &mov (&DWP(28,"esp"),"ecx"); 501 502 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 503 &jb (&label("grand_loop")); 504 505 &mov ("esp",&DWP(96+12,"esp")); # restore sp 506 &function_end_A(); 507 } 508 if (!$i386 && $xmm) {{{ 509 if ($shaext) { 510 ###################################################################### 511 # Intel SHA Extensions implementation of SHA256 update function. 512 # 513 my ($ctx,$inp,$end)=("esi","edi","eax"); 514 my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); 515 my @MSG=map("xmm$_",(3..6)); 516 517 sub sha256op38 { 518 my ($opcodelet,$dst,$src)=@_; 519 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 520 { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } 521 } 522 sub sha256rnds2 { sha256op38(0xcb,@_); } 523 sub sha256msg1 { sha256op38(0xcc,@_); } 524 sub sha256msg2 { sha256op38(0xcd,@_); } 525 526 &set_label("shaext",32); 527 &sub ("esp",32); 528 529 &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA 530 &lea ($K256,&DWP(0x80,$K256)); 531 &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE 532 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 533 534 &pshufd ($Wi,$ABEF,0x1b); # ABCD 535 &pshufd ($ABEF,$ABEF,0xb1); # CDAB 536 &pshufd ($CDGH,$CDGH,0x1b); # EFGH 537 &palignr ($ABEF,$CDGH,8); # ABEF 538 &punpcklqdq ($CDGH,$Wi); # CDGH 539 &jmp (&label("loop_shaext")); 540 541 &set_label("loop_shaext",16); 542 &movdqu (@MSG[0],&QWP(0,$inp)); 543 &movdqu (@MSG[1],&QWP(0x10,$inp)); 544 &movdqu (@MSG[2],&QWP(0x20,$inp)); 545 &pshufb (@MSG[0],$TMP); 546 &movdqu (@MSG[3],&QWP(0x30,$inp)); 547 &movdqa (&QWP(16,"esp"),$CDGH); # offload 548 549 &movdqa ($Wi,&QWP(0*16-0x80,$K256)); 550 &paddd ($Wi,@MSG[0]); 551 &pshufb (@MSG[1],$TMP); 552 &sha256rnds2 ($CDGH,$ABEF); # 0-3 553 &pshufd ($Wi,$Wi,0x0e); 554 &nop (); 555 &movdqa (&QWP(0,"esp"),$ABEF); # offload 556 &sha256rnds2 ($ABEF,$CDGH); 557 558 &movdqa ($Wi,&QWP(1*16-0x80,$K256)); 559 &paddd ($Wi,@MSG[1]); 560 &pshufb (@MSG[2],$TMP); 561 &sha256rnds2 ($CDGH,$ABEF); # 4-7 562 &pshufd ($Wi,$Wi,0x0e); 563 &lea ($inp,&DWP(0x40,$inp)); 564 &sha256msg1 (@MSG[0],@MSG[1]); 565 &sha256rnds2 ($ABEF,$CDGH); 566 567 &movdqa ($Wi,&QWP(2*16-0x80,$K256)); 568 &paddd ($Wi,@MSG[2]); 569 &pshufb (@MSG[3],$TMP); 570 &sha256rnds2 ($CDGH,$ABEF); # 8-11 571 &pshufd ($Wi,$Wi,0x0e); 572 &movdqa ($TMP,@MSG[3]); 573 &palignr ($TMP,@MSG[2],4); 574 &nop (); 575 &paddd (@MSG[0],$TMP); 576 &sha256msg1 (@MSG[1],@MSG[2]); 577 &sha256rnds2 ($ABEF,$CDGH); 578 579 &movdqa ($Wi,&QWP(3*16-0x80,$K256)); 580 &paddd ($Wi,@MSG[3]); 581 &sha256msg2 (@MSG[0],@MSG[3]); 582 &sha256rnds2 ($CDGH,$ABEF); # 12-15 583 &pshufd ($Wi,$Wi,0x0e); 584 &movdqa ($TMP,@MSG[0]); 585 &palignr ($TMP,@MSG[3],4); 586 &nop (); 587 &paddd (@MSG[1],$TMP); 588 &sha256msg1 (@MSG[2],@MSG[3]); 589 &sha256rnds2 ($ABEF,$CDGH); 590 591 for($i=4;$i<16-3;$i++) { 592 &movdqa ($Wi,&QWP($i*16-0x80,$K256)); 593 &paddd ($Wi,@MSG[0]); 594 &sha256msg2 (@MSG[1],@MSG[0]); 595 &sha256rnds2 ($CDGH,$ABEF); # 16-19... 596 &pshufd ($Wi,$Wi,0x0e); 597 &movdqa ($TMP,@MSG[1]); 598 &palignr ($TMP,@MSG[0],4); 599 &nop (); 600 &paddd (@MSG[2],$TMP); 601 &sha256msg1 (@MSG[3],@MSG[0]); 602 &sha256rnds2 ($ABEF,$CDGH); 603 604 push(@MSG,shift(@MSG)); 605 } 606 &movdqa ($Wi,&QWP(13*16-0x80,$K256)); 607 &paddd ($Wi,@MSG[0]); 608 &sha256msg2 (@MSG[1],@MSG[0]); 609 &sha256rnds2 ($CDGH,$ABEF); # 52-55 610 &pshufd ($Wi,$Wi,0x0e); 611 &movdqa ($TMP,@MSG[1]) 612 &palignr ($TMP,@MSG[0],4); 613 &sha256rnds2 ($ABEF,$CDGH); 614 &paddd (@MSG[2],$TMP); 615 616 &movdqa ($Wi,&QWP(14*16-0x80,$K256)); 617 &paddd ($Wi,@MSG[1]); 618 &sha256rnds2 ($CDGH,$ABEF); # 56-59 619 &pshufd ($Wi,$Wi,0x0e); 620 &sha256msg2 (@MSG[2],@MSG[1]); 621 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 622 &sha256rnds2 ($ABEF,$CDGH); 623 624 &movdqa ($Wi,&QWP(15*16-0x80,$K256)); 625 &paddd ($Wi,@MSG[2]); 626 &nop (); 627 &sha256rnds2 ($CDGH,$ABEF); # 60-63 628 &pshufd ($Wi,$Wi,0x0e); 629 &cmp ($end,$inp); 630 &nop (); 631 &sha256rnds2 ($ABEF,$CDGH); 632 633 &paddd ($CDGH,&QWP(16,"esp")); 634 &paddd ($ABEF,&QWP(0,"esp")); 635 &jnz (&label("loop_shaext")); 636 637 &pshufd ($CDGH,$CDGH,0xb1); # DCHG 638 &pshufd ($TMP,$ABEF,0x1b); # FEBA 639 &pshufd ($ABEF,$ABEF,0xb1); # BAFE 640 &punpckhqdq ($ABEF,$CDGH); # DCBA 641 &palignr ($CDGH,$TMP,8); # HGFE 642 643 &mov ("esp",&DWP(32+12,"esp")); 644 &movdqu (&QWP(0,$ctx),$ABEF); 645 &movdqu (&QWP(16,$ctx),$CDGH); 646 &function_end_A(); 647 } 648 649 my @X = map("xmm$_",(0..3)); 650 my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); 651 my @AH = ($A,$T); 652 653 &set_label("SSSE3",32); 654 &lea ("esp",&DWP(-96,"esp")); 655 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 656 &mov ($AH[0],&DWP(0,"esi")); 657 &mov ($AH[1],&DWP(4,"esi")); 658 &mov ("ecx",&DWP(8,"esi")); 659 &mov ("edi",&DWP(12,"esi")); 660 #&mov (&DWP(0,"esp"),$AH[0]); 661 &mov (&DWP(4,"esp"),$AH[1]); 662 &xor ($AH[1],"ecx"); # magic 663 &mov (&DWP(8,"esp"),"ecx"); 664 &mov (&DWP(12,"esp"),"edi"); 665 &mov ($E,&DWP(16,"esi")); 666 &mov ("edi",&DWP(20,"esi")); 667 &mov ("ecx",&DWP(24,"esi")); 668 &mov ("esi",&DWP(28,"esi")); 669 #&mov (&DWP(16,"esp"),$E); 670 &mov (&DWP(20,"esp"),"edi"); 671 &mov ("edi",&DWP(96+4,"esp")); # inp 672 &mov (&DWP(24,"esp"),"ecx"); 673 &mov (&DWP(28,"esp"),"esi"); 674 &movdqa ($t3,&QWP(256,$K256)); 675 &jmp (&label("grand_ssse3")); 676 677 &set_label("grand_ssse3",16); 678 # load input, reverse byte order, add K256[0..15], save to stack 679 &movdqu (@X[0],&QWP(0,"edi")); 680 &movdqu (@X[1],&QWP(16,"edi")); 681 &movdqu (@X[2],&QWP(32,"edi")); 682 &movdqu (@X[3],&QWP(48,"edi")); 683 &add ("edi",64); 684 &pshufb (@X[0],$t3); 685 &mov (&DWP(96+4,"esp"),"edi"); 686 &pshufb (@X[1],$t3); 687 &movdqa ($t0,&QWP(0,$K256)); 688 &pshufb (@X[2],$t3); 689 &movdqa ($t1,&QWP(16,$K256)); 690 &paddd ($t0,@X[0]); 691 &pshufb (@X[3],$t3); 692 &movdqa ($t2,&QWP(32,$K256)); 693 &paddd ($t1,@X[1]); 694 &movdqa ($t3,&QWP(48,$K256)); 695 &movdqa (&QWP(32+0,"esp"),$t0); 696 &paddd ($t2,@X[2]); 697 &movdqa (&QWP(32+16,"esp"),$t1); 698 &paddd ($t3,@X[3]); 699 &movdqa (&QWP(32+32,"esp"),$t2); 700 &movdqa (&QWP(32+48,"esp"),$t3); 701 &jmp (&label("ssse3_00_47")); 702 703 &set_label("ssse3_00_47",16); 704 &add ($K256,64); 705 706 sub SSSE3_00_47 () { 707 my $j = shift; 708 my $body = shift; 709 my @X = @_; 710 my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 711 712 eval(shift(@insns)); 713 &movdqa ($t0,@X[1]); 714 eval(shift(@insns)); # @ 715 eval(shift(@insns)); 716 &movdqa ($t3,@X[3]); 717 eval(shift(@insns)); 718 eval(shift(@insns)); 719 &palignr ($t0,@X[0],4); # X[1..4] 720 eval(shift(@insns)); 721 eval(shift(@insns)); # @ 722 eval(shift(@insns)); 723 &palignr ($t3,@X[2],4); # X[9..12] 724 eval(shift(@insns)); 725 eval(shift(@insns)); 726 eval(shift(@insns)); 727 &movdqa ($t1,$t0); 728 eval(shift(@insns)); # @ 729 eval(shift(@insns)); 730 &movdqa ($t2,$t0); 731 eval(shift(@insns)); 732 eval(shift(@insns)); 733 &psrld ($t0,3); 734 eval(shift(@insns)); 735 eval(shift(@insns)); # @ 736 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 737 eval(shift(@insns)); 738 eval(shift(@insns)); 739 &psrld ($t2,7); 740 eval(shift(@insns)); 741 eval(shift(@insns)); 742 eval(shift(@insns)); # @ 743 eval(shift(@insns)); 744 &pshufd ($t3,@X[3],0b11111010); # X[14..15] 745 eval(shift(@insns)); 746 eval(shift(@insns)); 747 &pslld ($t1,32-18); 748 eval(shift(@insns)); 749 eval(shift(@insns)); # @ 750 &pxor ($t0,$t2); 751 eval(shift(@insns)); 752 eval(shift(@insns)); 753 &psrld ($t2,18-7); 754 eval(shift(@insns)); 755 eval(shift(@insns)); 756 eval(shift(@insns)); # @ 757 &pxor ($t0,$t1); 758 eval(shift(@insns)); 759 eval(shift(@insns)); 760 &pslld ($t1,18-7); 761 eval(shift(@insns)); 762 eval(shift(@insns)); 763 eval(shift(@insns)); # @ 764 &pxor ($t0,$t2); 765 eval(shift(@insns)); 766 eval(shift(@insns)); 767 &movdqa ($t2,$t3); 768 eval(shift(@insns)); 769 eval(shift(@insns)); 770 eval(shift(@insns)); # @ 771 &pxor ($t0,$t1); # sigma0(X[1..4]) 772 eval(shift(@insns)); 773 eval(shift(@insns)); 774 &psrld ($t3,10); 775 eval(shift(@insns)); 776 eval(shift(@insns)); 777 eval(shift(@insns)); # @ 778 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 779 eval(shift(@insns)); 780 eval(shift(@insns)); 781 &psrlq ($t2,17); 782 eval(shift(@insns)); 783 eval(shift(@insns)); 784 eval(shift(@insns)); # @ 785 &pxor ($t3,$t2); 786 eval(shift(@insns)); 787 eval(shift(@insns)); 788 &psrlq ($t2,19-17); 789 eval(shift(@insns)); 790 eval(shift(@insns)); 791 eval(shift(@insns)); # @ 792 &pxor ($t3,$t2); 793 eval(shift(@insns)); 794 eval(shift(@insns)); 795 &pshufd ($t3,$t3,0b10000000); 796 eval(shift(@insns)); 797 eval(shift(@insns)); 798 eval(shift(@insns)); # @ 799 eval(shift(@insns)); 800 eval(shift(@insns)); 801 eval(shift(@insns)); 802 eval(shift(@insns)); 803 eval(shift(@insns)); # @ 804 eval(shift(@insns)); 805 &psrldq ($t3,8); 806 eval(shift(@insns)); 807 eval(shift(@insns)); 808 eval(shift(@insns)); 809 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 810 eval(shift(@insns)); # @ 811 eval(shift(@insns)); 812 eval(shift(@insns)); 813 eval(shift(@insns)); 814 eval(shift(@insns)); 815 eval(shift(@insns)); # @ 816 eval(shift(@insns)); 817 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 818 eval(shift(@insns)); 819 eval(shift(@insns)); 820 eval(shift(@insns)); 821 &movdqa ($t2,$t3); 822 eval(shift(@insns)); # @ 823 &psrld ($t3,10); 824 eval(shift(@insns)); 825 &psrlq ($t2,17); 826 eval(shift(@insns)); 827 eval(shift(@insns)); 828 eval(shift(@insns)); 829 eval(shift(@insns)); # @ 830 &pxor ($t3,$t2); 831 eval(shift(@insns)); 832 eval(shift(@insns)); 833 &psrlq ($t2,19-17); 834 eval(shift(@insns)); 835 eval(shift(@insns)); 836 eval(shift(@insns)); # @ 837 &pxor ($t3,$t2); 838 eval(shift(@insns)); 839 eval(shift(@insns)); 840 eval(shift(@insns)); 841 &pshufd ($t3,$t3,0b00001000); 842 eval(shift(@insns)); 843 eval(shift(@insns)); # @ 844 &movdqa ($t2,&QWP(16*$j,$K256)); 845 eval(shift(@insns)); 846 eval(shift(@insns)); 847 &pslldq ($t3,8); 848 eval(shift(@insns)); 849 eval(shift(@insns)); 850 eval(shift(@insns)); # @ 851 eval(shift(@insns)); 852 eval(shift(@insns)); 853 eval(shift(@insns)); 854 eval(shift(@insns)); 855 eval(shift(@insns)); # @ 856 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 857 eval(shift(@insns)); 858 eval(shift(@insns)); 859 eval(shift(@insns)); 860 eval(shift(@insns)); 861 &paddd ($t2,@X[0]); 862 eval(shift(@insns)); # @ 863 864 foreach (@insns) { eval; } # remaining instructions 865 866 &movdqa (&QWP(32+16*$j,"esp"),$t2); 867 } 868 869 sub body_00_15 () { 870 ( 871 '&mov ("ecx",$E);', 872 '&ror ($E,25-11);', 873 '&mov ("esi",&off($f));', 874 '&xor ($E,"ecx");', 875 '&mov ("edi",&off($g));', 876 '&xor ("esi","edi");', 877 '&ror ($E,11-6);', 878 '&and ("esi","ecx");', 879 '&mov (&off($e),"ecx");', # save $E, modulo-scheduled 880 '&xor ($E,"ecx");', 881 '&xor ("edi","esi");', # Ch(e,f,g) 882 '&ror ($E,6);', # T = Sigma1(e) 883 '&mov ("ecx",$AH[0]);', 884 '&add ($E,"edi");', # T += Ch(e,f,g) 885 '&mov ("edi",&off($b));', 886 '&mov ("esi",$AH[0]);', 887 888 '&ror ("ecx",22-13);', 889 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 890 '&xor ("ecx",$AH[0]);', 891 '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round 892 '&add ($E,&off($h));', # T += h 893 '&ror ("ecx",13-2);', 894 '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) 895 '&xor ("ecx","esi");', 896 '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] 897 '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) 898 '&ror ("ecx",2);', # Sigma0(a) 899 900 '&add ($AH[1],$E);', # h += T 901 '&add ($E,&off($d));', # d += T 902 '&add ($AH[1],"ecx");'. # h += Sigma0(a) 903 904 '@AH = reverse(@AH); $i++;' # rotate(a,h) 905 ); 906 } 907 908 for ($i=0,$j=0; $j<4; $j++) { 909 &SSSE3_00_47($j,\&body_00_15,@X); 910 push(@X,shift(@X)); # rotate(@X) 911 } 912 &cmp (&DWP(16*$j,$K256),0x00010203); 913 &jne (&label("ssse3_00_47")); 914 915 for ($i=0; $i<16; ) { 916 foreach(body_00_15()) { eval; } 917 } 918 919 &mov ("esi",&DWP(96,"esp")); #ctx 920 #&mov ($AH[0],&DWP(0,"esp")); 921 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 922 #&mov ("edi", &DWP(8,"esp")); 923 &mov ("ecx",&DWP(12,"esp")); 924 &add ($AH[0],&DWP(0,"esi")); 925 &add ($AH[1],&DWP(4,"esi")); 926 &add ("edi",&DWP(8,"esi")); 927 &add ("ecx",&DWP(12,"esi")); 928 &mov (&DWP(0,"esi"),$AH[0]); 929 &mov (&DWP(4,"esi"),$AH[1]); 930 &mov (&DWP(8,"esi"),"edi"); 931 &mov (&DWP(12,"esi"),"ecx"); 932 #&mov (&DWP(0,"esp"),$AH[0]); 933 &mov (&DWP(4,"esp"),$AH[1]); 934 &xor ($AH[1],"edi"); # magic 935 &mov (&DWP(8,"esp"),"edi"); 936 &mov (&DWP(12,"esp"),"ecx"); 937 #&mov ($E,&DWP(16,"esp")); 938 &mov ("edi",&DWP(20,"esp")); 939 &mov ("ecx",&DWP(24,"esp")); 940 &add ($E,&DWP(16,"esi")); 941 &add ("edi",&DWP(20,"esi")); 942 &add ("ecx",&DWP(24,"esi")); 943 &mov (&DWP(16,"esi"),$E); 944 &mov (&DWP(20,"esi"),"edi"); 945 &mov (&DWP(20,"esp"),"edi"); 946 &mov ("edi",&DWP(28,"esp")); 947 &mov (&DWP(24,"esi"),"ecx"); 948 #&mov (&DWP(16,"esp"),$E); 949 &add ("edi",&DWP(28,"esi")); 950 &mov (&DWP(24,"esp"),"ecx"); 951 &mov (&DWP(28,"esi"),"edi"); 952 &mov (&DWP(28,"esp"),"edi"); 953 &mov ("edi",&DWP(96+4,"esp")); # inp 954 955 &movdqa ($t3,&QWP(64,$K256)); 956 &sub ($K256,3*64); # rewind K 957 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 958 &jb (&label("grand_ssse3")); 959 960 &mov ("esp",&DWP(96+12,"esp")); # restore sp 961 &function_end_A(); 962 if ($avx) { 963 &set_label("AVX",32); 964 if ($avx>1) { 965 &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 966 &cmp ("edx",1<<8|1<<3); 967 &je (&label("AVX_BMI")); 968 } 969 &lea ("esp",&DWP(-96,"esp")); 970 &vzeroall (); 971 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 972 &mov ($AH[0],&DWP(0,"esi")); 973 &mov ($AH[1],&DWP(4,"esi")); 974 &mov ("ecx",&DWP(8,"esi")); 975 &mov ("edi",&DWP(12,"esi")); 976 #&mov (&DWP(0,"esp"),$AH[0]); 977 &mov (&DWP(4,"esp"),$AH[1]); 978 &xor ($AH[1],"ecx"); # magic 979 &mov (&DWP(8,"esp"),"ecx"); 980 &mov (&DWP(12,"esp"),"edi"); 981 &mov ($E,&DWP(16,"esi")); 982 &mov ("edi",&DWP(20,"esi")); 983 &mov ("ecx",&DWP(24,"esi")); 984 &mov ("esi",&DWP(28,"esi")); 985 #&mov (&DWP(16,"esp"),$E); 986 &mov (&DWP(20,"esp"),"edi"); 987 &mov ("edi",&DWP(96+4,"esp")); # inp 988 &mov (&DWP(24,"esp"),"ecx"); 989 &mov (&DWP(28,"esp"),"esi"); 990 &vmovdqa ($t3,&QWP(256,$K256)); 991 &jmp (&label("grand_avx")); 992 993 &set_label("grand_avx",32); 994 # load input, reverse byte order, add K256[0..15], save to stack 995 &vmovdqu (@X[0],&QWP(0,"edi")); 996 &vmovdqu (@X[1],&QWP(16,"edi")); 997 &vmovdqu (@X[2],&QWP(32,"edi")); 998 &vmovdqu (@X[3],&QWP(48,"edi")); 999 &add ("edi",64); 1000 &vpshufb (@X[0],@X[0],$t3); 1001 &mov (&DWP(96+4,"esp"),"edi"); 1002 &vpshufb (@X[1],@X[1],$t3); 1003 &vpshufb (@X[2],@X[2],$t3); 1004 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1005 &vpshufb (@X[3],@X[3],$t3); 1006 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1007 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1008 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1009 &vmovdqa (&QWP(32+0,"esp"),$t0); 1010 &vmovdqa (&QWP(32+16,"esp"),$t1); 1011 &vmovdqa (&QWP(32+32,"esp"),$t2); 1012 &vmovdqa (&QWP(32+48,"esp"),$t3); 1013 &jmp (&label("avx_00_47")); 1014 1015 &set_label("avx_00_47",16); 1016 &add ($K256,64); 1017 1018 sub Xupdate_AVX () { 1019 ( 1020 '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] 1021 '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] 1022 '&vpsrld ($t2,$t0,7);', 1023 '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] 1024 '&vpsrld ($t3,$t0,3);', 1025 '&vpslld ($t1,$t0,14);', 1026 '&vpxor ($t0,$t3,$t2);', 1027 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1028 '&vpsrld ($t2,$t2,18-7);', 1029 '&vpxor ($t0,$t0,$t1);', 1030 '&vpslld ($t1,$t1,25-14);', 1031 '&vpxor ($t0,$t0,$t2);', 1032 '&vpsrld ($t2,$t3,10);', 1033 '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) 1034 '&vpsrlq ($t1,$t3,17);', 1035 '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 1036 '&vpxor ($t2,$t2,$t1);', 1037 '&vpsrlq ($t3,$t3,19);', 1038 '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15] 1039 '&vpshufd ($t3,$t2,0b10000100);', 1040 '&vpsrldq ($t3,$t3,8);', 1041 '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) 1042 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1043 '&vpsrld ($t2,$t3,10);', 1044 '&vpsrlq ($t1,$t3,17);', 1045 '&vpxor ($t2,$t2,$t1);', 1046 '&vpsrlq ($t3,$t3,19);', 1047 '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17] 1048 '&vpshufd ($t3,$t2,0b11101000);', 1049 '&vpslldq ($t3,$t3,8);', 1050 '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) 1051 ); 1052 } 1053 1054 local *ror = sub { &shrd(@_[0],@_) }; 1055 sub AVX_00_47 () { 1056 my $j = shift; 1057 my $body = shift; 1058 my @X = @_; 1059 my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 1060 my $insn; 1061 1062 foreach (Xupdate_AVX()) { # 31 instructions 1063 eval; 1064 eval(shift(@insns)); 1065 eval(shift(@insns)); 1066 eval($insn = shift(@insns)); 1067 eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); 1068 } 1069 &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); 1070 foreach (@insns) { eval; } # remaining instructions 1071 &vmovdqa (&QWP(32+16*$j,"esp"),$t2); 1072 } 1073 1074 for ($i=0,$j=0; $j<4; $j++) { 1075 &AVX_00_47($j,\&body_00_15,@X); 1076 push(@X,shift(@X)); # rotate(@X) 1077 } 1078 &cmp (&DWP(16*$j,$K256),0x00010203); 1079 &jne (&label("avx_00_47")); 1080 1081 for ($i=0; $i<16; ) { 1082 foreach(body_00_15()) { eval; } 1083 } 1084 1085 &mov ("esi",&DWP(96,"esp")); #ctx 1086 #&mov ($AH[0],&DWP(0,"esp")); 1087 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1088 #&mov ("edi", &DWP(8,"esp")); 1089 &mov ("ecx",&DWP(12,"esp")); 1090 &add ($AH[0],&DWP(0,"esi")); 1091 &add ($AH[1],&DWP(4,"esi")); 1092 &add ("edi",&DWP(8,"esi")); 1093 &add ("ecx",&DWP(12,"esi")); 1094 &mov (&DWP(0,"esi"),$AH[0]); 1095 &mov (&DWP(4,"esi"),$AH[1]); 1096 &mov (&DWP(8,"esi"),"edi"); 1097 &mov (&DWP(12,"esi"),"ecx"); 1098 #&mov (&DWP(0,"esp"),$AH[0]); 1099 &mov (&DWP(4,"esp"),$AH[1]); 1100 &xor ($AH[1],"edi"); # magic 1101 &mov (&DWP(8,"esp"),"edi"); 1102 &mov (&DWP(12,"esp"),"ecx"); 1103 #&mov ($E,&DWP(16,"esp")); 1104 &mov ("edi",&DWP(20,"esp")); 1105 &mov ("ecx",&DWP(24,"esp")); 1106 &add ($E,&DWP(16,"esi")); 1107 &add ("edi",&DWP(20,"esi")); 1108 &add ("ecx",&DWP(24,"esi")); 1109 &mov (&DWP(16,"esi"),$E); 1110 &mov (&DWP(20,"esi"),"edi"); 1111 &mov (&DWP(20,"esp"),"edi"); 1112 &mov ("edi",&DWP(28,"esp")); 1113 &mov (&DWP(24,"esi"),"ecx"); 1114 #&mov (&DWP(16,"esp"),$E); 1115 &add ("edi",&DWP(28,"esi")); 1116 &mov (&DWP(24,"esp"),"ecx"); 1117 &mov (&DWP(28,"esi"),"edi"); 1118 &mov (&DWP(28,"esp"),"edi"); 1119 &mov ("edi",&DWP(96+4,"esp")); # inp 1120 1121 &vmovdqa ($t3,&QWP(64,$K256)); 1122 &sub ($K256,3*64); # rewind K 1123 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1124 &jb (&label("grand_avx")); 1125 1126 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1127 &vzeroall (); 1128 &function_end_A(); 1129 if ($avx>1) { 1130 sub bodyx_00_15 () { # +10% 1131 ( 1132 '&rorx ("ecx",$E,6)', 1133 '&rorx ("esi",$E,11)', 1134 '&mov (&off($e),$E)', # save $E, modulo-scheduled 1135 '&rorx ("edi",$E,25)', 1136 '&xor ("ecx","esi")', 1137 '&andn ("esi",$E,&off($g))', 1138 '&xor ("ecx","edi")', # Sigma1(e) 1139 '&and ($E,&off($f))', 1140 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 1141 '&or ($E,"esi")', # T = Ch(e,f,g) 1142 1143 '&rorx ("edi",$AH[0],2)', 1144 '&rorx ("esi",$AH[0],13)', 1145 '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) 1146 '&rorx ("ecx",$AH[0],22)', 1147 '&xor ("esi","edi")', 1148 '&mov ("edi",&off($b))', 1149 '&xor ("ecx","esi")', # Sigma0(a) 1150 1151 '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round 1152 '&add ($E,&off($h))', # T += h 1153 '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) 1154 '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] 1155 '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) 1156 1157 '&add ("ecx",$E)', # h += T 1158 '&add ($E,&off($d))', # d += T 1159 '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) 1160 1161 '@AH = reverse(@AH); $i++;' # rotate(a,h) 1162 ); 1163 } 1164 1165 &set_label("AVX_BMI",32); 1166 &lea ("esp",&DWP(-96,"esp")); 1167 &vzeroall (); 1168 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 1169 &mov ($AH[0],&DWP(0,"esi")); 1170 &mov ($AH[1],&DWP(4,"esi")); 1171 &mov ("ecx",&DWP(8,"esi")); 1172 &mov ("edi",&DWP(12,"esi")); 1173 #&mov (&DWP(0,"esp"),$AH[0]); 1174 &mov (&DWP(4,"esp"),$AH[1]); 1175 &xor ($AH[1],"ecx"); # magic 1176 &mov (&DWP(8,"esp"),"ecx"); 1177 &mov (&DWP(12,"esp"),"edi"); 1178 &mov ($E,&DWP(16,"esi")); 1179 &mov ("edi",&DWP(20,"esi")); 1180 &mov ("ecx",&DWP(24,"esi")); 1181 &mov ("esi",&DWP(28,"esi")); 1182 #&mov (&DWP(16,"esp"),$E); 1183 &mov (&DWP(20,"esp"),"edi"); 1184 &mov ("edi",&DWP(96+4,"esp")); # inp 1185 &mov (&DWP(24,"esp"),"ecx"); 1186 &mov (&DWP(28,"esp"),"esi"); 1187 &vmovdqa ($t3,&QWP(256,$K256)); 1188 &jmp (&label("grand_avx_bmi")); 1189 1190 &set_label("grand_avx_bmi",32); 1191 # load input, reverse byte order, add K256[0..15], save to stack 1192 &vmovdqu (@X[0],&QWP(0,"edi")); 1193 &vmovdqu (@X[1],&QWP(16,"edi")); 1194 &vmovdqu (@X[2],&QWP(32,"edi")); 1195 &vmovdqu (@X[3],&QWP(48,"edi")); 1196 &add ("edi",64); 1197 &vpshufb (@X[0],@X[0],$t3); 1198 &mov (&DWP(96+4,"esp"),"edi"); 1199 &vpshufb (@X[1],@X[1],$t3); 1200 &vpshufb (@X[2],@X[2],$t3); 1201 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1202 &vpshufb (@X[3],@X[3],$t3); 1203 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1204 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1205 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1206 &vmovdqa (&QWP(32+0,"esp"),$t0); 1207 &vmovdqa (&QWP(32+16,"esp"),$t1); 1208 &vmovdqa (&QWP(32+32,"esp"),$t2); 1209 &vmovdqa (&QWP(32+48,"esp"),$t3); 1210 &jmp (&label("avx_bmi_00_47")); 1211 1212 &set_label("avx_bmi_00_47",16); 1213 &add ($K256,64); 1214 1215 for ($i=0,$j=0; $j<4; $j++) { 1216 &AVX_00_47($j,\&bodyx_00_15,@X); 1217 push(@X,shift(@X)); # rotate(@X) 1218 } 1219 &cmp (&DWP(16*$j,$K256),0x00010203); 1220 &jne (&label("avx_bmi_00_47")); 1221 1222 for ($i=0; $i<16; ) { 1223 foreach(bodyx_00_15()) { eval; } 1224 } 1225 1226 &mov ("esi",&DWP(96,"esp")); #ctx 1227 #&mov ($AH[0],&DWP(0,"esp")); 1228 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1229 #&mov ("edi", &DWP(8,"esp")); 1230 &mov ("ecx",&DWP(12,"esp")); 1231 &add ($AH[0],&DWP(0,"esi")); 1232 &add ($AH[1],&DWP(4,"esi")); 1233 &add ("edi",&DWP(8,"esi")); 1234 &add ("ecx",&DWP(12,"esi")); 1235 &mov (&DWP(0,"esi"),$AH[0]); 1236 &mov (&DWP(4,"esi"),$AH[1]); 1237 &mov (&DWP(8,"esi"),"edi"); 1238 &mov (&DWP(12,"esi"),"ecx"); 1239 #&mov (&DWP(0,"esp"),$AH[0]); 1240 &mov (&DWP(4,"esp"),$AH[1]); 1241 &xor ($AH[1],"edi"); # magic 1242 &mov (&DWP(8,"esp"),"edi"); 1243 &mov (&DWP(12,"esp"),"ecx"); 1244 #&mov ($E,&DWP(16,"esp")); 1245 &mov ("edi",&DWP(20,"esp")); 1246 &mov ("ecx",&DWP(24,"esp")); 1247 &add ($E,&DWP(16,"esi")); 1248 &add ("edi",&DWP(20,"esi")); 1249 &add ("ecx",&DWP(24,"esi")); 1250 &mov (&DWP(16,"esi"),$E); 1251 &mov (&DWP(20,"esi"),"edi"); 1252 &mov (&DWP(20,"esp"),"edi"); 1253 &mov ("edi",&DWP(28,"esp")); 1254 &mov (&DWP(24,"esi"),"ecx"); 1255 #&mov (&DWP(16,"esp"),$E); 1256 &add ("edi",&DWP(28,"esi")); 1257 &mov (&DWP(24,"esp"),"ecx"); 1258 &mov (&DWP(28,"esi"),"edi"); 1259 &mov (&DWP(28,"esp"),"edi"); 1260 &mov ("edi",&DWP(96+4,"esp")); # inp 1261 1262 &vmovdqa ($t3,&QWP(64,$K256)); 1263 &sub ($K256,3*64); # rewind K 1264 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1265 &jb (&label("grand_avx_bmi")); 1266 1267 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1268 &vzeroall (); 1269 &function_end_A(); 1270 } 1271 } 1272 }}} 1273 &function_end_B("sha256_block_data_order"); 1274 1275 &asm_finish(); 1276