#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
# function features so called "528B" variant utilizing additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse <dwmw2 (at] infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl argues that it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficient), but not for Bulldozer. The latter
# is because logical SIMD operations are twice as slow as on Intel,
# so that the critical path is longer. A CPU with higher pclmulqdq
# issue rate would also benefit from a higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Bulldozer	1.49(+27%)

# March 2013
#
# The 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But
# thanks to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know
# that it performs at 0.41 cycles per byte on a Haswell processor.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
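# For orientation, a plain-C sketch of the single multiplication the
# "4-bit" code below implements (loosely following the generic C code
# in crypto/modes/gcm128.c; u8/u16/u64/u128, BSWAP8 and the table
# layout are assumptions of this illustration, not definitions used
# by the assembler code):
#
#	typedef struct { u64 hi,lo; } u128;
#	static const u16 rem_4bit[16] = {
#	    0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
#	    0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0 };
#
#	void gcm_gmult_4bit_ref(u64 Xi[2], const u128 Htable[16])
#	{
#	    u128 Z;
#	    int cnt = 15;
#	    size_t rem, nlo, nhi;
#	    const u8 *xi = (const u8 *)Xi;
#
#	    nlo  = xi[15];
#	    nhi  = nlo>>4;
#	    nlo &= 0xf;
#	    Z.hi = Htable[nlo].hi;
#	    Z.lo = Htable[nlo].lo;
#
#	    while (1) {
#	        /* shift Z right by 4 bits, folding the 4 bits shifted
#	         * out back in via the precomputed remainder table */
#	        rem  = (size_t)Z.lo&0xf;
#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);
#	        Z.hi = (Z.hi>>4)^((u64)rem_4bit[rem]<<48);
#	        Z.hi ^= Htable[nhi].hi;
#	        Z.lo ^= Htable[nhi].lo;
#
#	        if (--cnt<0) break;
#
#	        nlo  = xi[cnt];
#	        nhi  = nlo>>4;
#	        nlo &= 0xf;
#
#	        rem  = (size_t)Z.lo&0xf;
#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);
#	        Z.hi = (Z.hi>>4)^((u64)rem_4bit[rem]<<48);
#	        Z.hi ^= Htable[nlo].hi;
#	        Z.lo ^= Htable[nlo].lo;
#	    }
#	    Xi[0] = BSWAP8(Z.hi);	/* back to big-endian */
#	    Xi[1] = BSWAP8(Z.lo);
#	}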
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$do4xaggr=1;

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
			$r =~ s/%[er]([sd]i)/%\1l/	or
			$r =~ s/%[er](bp)/%\1l/		or
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}
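# Note on the addressing in the loop above: the low nibble is
# pre-shifted left by 4 so that it can serve directly as the byte
# offset into the 256-byte $Htbl, whose sixteen 16-byte entries each
# hold one precomputed nibble multiple of H with the low 64 bits at
# offset 8 and the high 64 bits at offset 0; ($rem_4bit,$rem,8)
# likewise indexes the 8-byte entries of the shared .Lrem_4bit table
# defined at the bottom of this file.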
$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
	push	%rbx
	push	%rbp		# %rbp and %r12 are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
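# The streamed gcm_ghash_4bit below implements the "528B" variant
# advertised at the top of the file: per 16-byte block Ii it computes
# Xi = (Xi ^ Ii) * H mod P, but consumes a byte per inner iteration
# rather than a nibble. To that end the prologue builds two auxiliary
# structures on the stack - a 256-byte copy of the key table
# pre-shifted right by 4 bits ($Hshr4) and a 16-byte nibble table -
# which together with the 256-byte Htbl account for the 528 bytes of
# per-key storage; byte-wide remainders are then folded via the
# shared 512-byte .Lrem_8bit table.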
# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;

$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$280,%rsp
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  &xor	($dat,$dat);
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

$code.=".align	16\n.Louter_loop:\n";
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)				if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)				if ($i<14);
	    &and	($nhi[1],0xf0)				if ($i==14);
	    &shl	($rem[1],48)				if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])				if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}
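# clmul64x64_T2 above is a Karatsuba 128x128->256-bit carry-less
# multiplication: three pclmulqdq's instead of four. As a worked form
# of the identity (illustration only, with a notional clmul64()
# primitive; A1:A0 and B1:B0 are the 64-bit halves of the operands):
#
#	lo   = clmul64(A0,B0);		/* pclmulqdq 0x00	*/
#	hi   = clmul64(A1,B1);		/* pclmulqdq 0x11	*/
#	mid  = clmul64(A0^A1,B0^B1);	/* pclmulqdq on $HK	*/
#	mid ^= lo^hi;			/* Karatsuba post-processing */
#	/* 256-bit result = hi, with mid folded into the middle
#	 * 128 bits, then lo */
#
# reduction_alg9 then folds the 256-bit product modulo the reflected
# GHASH polynomial with plain shifts and xors in two phases, keeping
# the steps on $Xi's critical path independent enough for decent ILP.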
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
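# gcm_init_clmul leaves the following layout behind for the other
# clmul routines (byte offsets into $Htbl):
#
#	0x00  H      0x10  H^2    0x20  Karatsuba "salt" for H,H^2
#	0x30  H^3    0x40  H^4    0x50  Karatsuba "salt" for H^3,H^4
#
# each "salt" packing the pre-xored halves Hi.lo^Hi.hi for a pair of
# powers, so that gcm_gmult_clmul/gcm_ghash_clmul can feed their
# middle pclmulqdq without redoing the Karatsuba pre-processing per
# block.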
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	mov		OPENSSL_ia32cap_P+4(%rip),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x

	jmp		.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
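# A single reduction_alg9 call below folds the Karatsuba-combined
# 256-bit sum of all four products; one reduction per four
# multiplications is the payoff of the 4x aggregation.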
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	movaps		0x10(%rsp),%xmm7
	movaps		0x20(%rsp),%xmm8
	movaps		0x30(%rsp),%xmm9
	movaps		0x40(%rsp),%xmm10
	movaps		0x50(%rsp),%xmm11
	movaps		0x60(%rsp),%xmm12
	movaps		0x70(%rsp),%xmm13
	movaps		0x80(%rsp),%xmm14
	movaps		0x90(%rsp),%xmm15
	lea		0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
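# The AVX code paths below keep the same building blocks in 3-operand
# form and raise the aggregate factor to 8x: gcm_init_avx precomputes
# H^1 through H^8 (with a Karatsuba "salt" per pair of powers) and
# gcm_ghash_avx consumes 0x80 bytes of input per iteration, with the
# pclmulqdq-based reduction suggested by Shay Gueron (see the March
# 2013 note at the top of the file) interleaved into the
# multiplication schedule.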
$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}
$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	movaps		0x10(%rsp),%xmm7
	movaps		0x20(%rsp),%xmm8
	movaps		0x30(%rsp),%xmm9
	movaps		0x40(%rsp),%xmm10
	movaps		0x50(%rsp),%xmm11
	movaps		0x60(%rsp),%xmm12
	movaps		0x70(%rsp),%xmm13
	movaps		0x80(%rsp),%xmm14
	movaps		0x90(%rsp),%xmm15
	lea		0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
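	# Both remainder tables hold carry-less multiples of 0x1C2, i.e.
	# of the reflected GHASH polynomial 0xE1 shifted left by one:
	# .Lrem_8bit[n] is the 16-bit product n*0x1C2, while .Lrem_4bit[n]
	# is the same product for a 4-bit n (equal to .Lrem_8bit[16*n])
	# pre-positioned in bits 48-63 of a 64-bit word, ready to be
	# xor-ed into the high half of Z as 4 bits are shifted out.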
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax		# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps	0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps	0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps	0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps	0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps	0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps	0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps	0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps	0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;