#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. The GHASH function features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
# shared table]. Performance results are for this streamed GHASH
# subroutine and are expressed in cycles per processed byte, less is
# better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase the reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor there. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay down
# the aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to perform
# the remaining non-multiplication operations. 2x is a near-optimal
# coefficient for contemporary Intel CPUs (hence the modest improvement
# coefficient), but not for Bulldozer, whose logical SIMD operations
# are twice as slow as Intel's, making the critical path longer. A CPU
# with a higher pclmulqdq issue rate would also benefit from a higher
# aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)
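# For orientation, a rough C-style sketch of the "4-bit" table-driven
# multiplication the code below implements (illustrative only, names
# are hypothetical; the real code interleaves high/low nibbles):
#
#	Z = Htable[Xi[15] & 0xf];		/* lowest nibble first */
#	for (each remaining nibble n of Xi, last byte to first) {
#		rem = Z & 0xf;
#		Z   = (Z >> 4) ^ rem_4bit[rem];	/* fold shifted-out bits */
#		Z  ^= Htable[n];		/* accumulate H * nibble */
#	}
#
# Htable[i] holds H*i for every 4-bit value i, and rem_4bit folds the
# 4 bits shifted out on the right back in at the top.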
# March 2013
#
# ... 8x aggregate factor AVX code path is using the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
# code performs sub-optimally in comparison to the above-mentioned
# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
# Corp. we knew that it performs at 0.41 cycles per byte on Haswell
# processors, at 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
			$r =~ s/%[er]([sd]i)/%\1l/	or
			$r =~ s/%[er](bp)/%\1l/		or
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

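	# Zlo:Zhi were accumulated in native little-endian order; flip
	# them back to the big-endian byte order the Xi buffer uses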
	bswap	$Zlo
	bswap	$Zhi
___
}}

$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp		# %rbp and others are pushed exclusively in
.cfi_push	%rbp
	push	%r12		# order to reuse Win64 exception handler...
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$280,%rsp
.cfi_adjust_cfa_offset	280
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lgmult_epilogue:
	ret
.cfi_endproc
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;

$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$280,%rsp
.cfi_adjust_cfa_offset	280
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  &xor	($dat,$dat);
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

$code.=".align	16\n.Louter_loop:\n";
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

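	# The unrolled loop below folds one input byte per step: the low
	# nibble indexes $Htbl directly, the high nibble indexes the
	# pre-shifted $Hshr4 copy built above, and the bits shifted out
	# on the right are folded back in through the 16-bit .Lrem_8bit
	# table.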
	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")		if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")		if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")		if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")		if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)			if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)			if ($i<14);
	    &and	($nhi[1],0xf0)			if ($i==14);
	    &shl	($rem[1],48)			if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])			if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")		if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	0(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lghash_epilogue:
	ret
.cfi_endproc
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pshufd	\$0b01001110,$Hkey,$T2
	pxor	$Xi,$T1			#
	pxor	$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pxor	$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor	$Xi,$T1			#
	pxor	$Xhi,$T1		#

	movdqa	$T1,$T2			#
	psrldq	\$8,$T1
	pslldq	\$8,$T2			#
	pxor	$T1,$Xhi
	pxor	$T2,$Xi			#
___
}

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa	$Xi,$T2			#
	movdqa	$Xi,$T1
	psllq	\$5,$Xi
	pxor	$Xi,$T1			#
	psllq	\$1,$Xi
	pxor	$T1,$Xi			#
	psllq	\$57,$Xi		#
	movdqa	$Xi,$T1			#
	pslldq	\$8,$Xi
	psrldq	\$8,$T1			#
	pxor	$T2,$Xi
	pxor	$T1,$Xhi		#

	# 2nd phase
	movdqa	$Xi,$T2
	psrlq	\$1,$Xi
	pxor	$T2,$Xhi		#
	pxor	$Xi,$T2
	psrlq	\$5,$Xi
	pxor	$T2,$Xi			#
	psrlq	\$1,$Xi			#
	pxor	$Xhi,$Xi		#
___
}

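# For reference: reduction_alg9 folds the 256-bit product Xhi:Xi back
# to 128 bits modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.
# As GHASH keeps operands bit-reflected, the x, x^2 and x^7 terms
# surface as the mirrored per-lane left shifts by 63, 62 and 57 in the
# 1st phase, and the 2nd phase boils down to, per 64-bit lane,
# Xi ^= (Xi>>1) ^ (Xi>>2) ^ (Xi>>7) ^ Xhi.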
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu	($Xip),$Hkey
	pshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd	\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa	$Hkey,$T1
	psllq	\$1,$Hkey
	pxor	$T3,$T3			#
	psrlq	\$63,$T1
	pcmpgtd	$T2,$T3			# broadcast carry bit
	pslldq	\$8,$T1
	por	$T1,$Hkey		# H<<=1

	# magic reduction
	pand	.L0x1c2_polynomial(%rip),$T3
	pxor	$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd	\$0b01001110,$Hkey,$HK
	movdqa	$Hkey,$Xi
	pxor	$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$Hkey,$T1
	pshufd	\$0b01001110,$Xi,$T2
	pxor	$Hkey,$T1		# Karatsuba pre-processing
	movdqu	$Hkey,0x00($Htbl)	# save H
	pxor	$Xi,$T2			# Karatsuba pre-processing
	movdqu	$Xi,0x10($Htbl)		# save H^2
	palignr	\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu	$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa	$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$T3,$T1
	pshufd	\$0b01001110,$Xi,$T2
	pxor	$T3,$T1			# Karatsuba pre-processing
	movdqu	$T3,0x30($Htbl)		# save H^3
	pxor	$Xi,$T2			# Karatsuba pre-processing
	movdqu	$Xi,0x40($Htbl)		# save H^4
	palignr	\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu	$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

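# On return from gcm_init_clmul, $Htbl holds, at 16-byte offsets:
# 0x00 H, 0x10 H^2, 0x20 the packed Karatsuba "salt" (hi^lo halves) for
# H and H^2, and, with $do4xaggr set, 0x30 H^3, 0x40 H^4 and 0x50 their
# salt. The multiplication routines below depend on this layout.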
{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
.L_gmult_clmul:
	movdqu	($Xip),$Xi
	movdqa	.Lbswap_mask(%rip),$T3
	movdqu	($Htbl),$Hkey
	movdqu	0x20($Htbl),$T2
	pshufb	$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. special thing about it is that there
	# is no dependency between the two multiplications...
	mov	\$`0xE1<<1`,%eax
	mov	\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov	\$0x07,%r11d
	movq	%rax,$T1
	movq	%r10,$T2
	movq	%r11,$T3		# borrow $T3
	pand	$Xi,$T3
	pshufb	$T3,$T2			# ($Xi&7)·0xE0
	movq	%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1	# ·(0xE1<<1)
	pxor	$Xi,$T2
	pslldq	\$15,$T2
	paddd	$T2,$T2			# <<(64+56+1)
	pxor	$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa	.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq	\$1,$T1
	pxor	$T1,$Xhi
	pslldq	\$7,$Xi
	pxor	$Xhi,$Xi
___
$code.=<<___;
	pshufb	$T3,$Xi
	movdqu	$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa	.Lbswap_mask(%rip),$T3

	movdqu	($Xip),$Xi
	movdqu	($Htbl),$Hkey
	movdqu	0x20($Htbl),$HK
	pshufb	$T3,$Xi

	sub	\$0x10,$len
	jz	.Lodd_tail

	movdqu	0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	mov	4(%rax),%eax
	cmp	\$0x30,$len
	jb	.Lskip4x

	and	\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp	\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je	.Lskip4x

	sub	\$0x30,$len
	mov	\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu	0x30($Htbl),$Hkey3
	movdqu	0x40($Htbl),$Hkey4

#######
# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
	movdqu	0x30($inp),$Xln
	movdqu	0x20($inp),$Xl
	pshufb	$T3,$Xln
	pshufb	$T3,$Xl
	movdqa	$Xln,$Xhn
	pshufd	\$0b01001110,$Xln,$Xmn
	pxor	$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa	$Xl,$Xh
	pshufd	\$0b01001110,$Xl,$Xm
	pxor	$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps	$Xl,$Xln
	xorps	$Xh,$Xhn
	movups	0x50($Htbl),$HK
	xorps	$Xm,$Xmn

	movdqu	0x10($inp),$Xl
	movdqu	0($inp),$T1
	pshufb	$T3,$Xl
	pshufb	$T3,$T1
	movdqa	$Xl,$Xh
	pshufd	\$0b01001110,$Xl,$Xm
	pxor	$T1,$Xi
	pxor	$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa	$Xi,$Xhi
	pshufd	\$0b01001110,$Xi,$T1
	pxor	$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps	$Xl,$Xln
	xorps	$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

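	# Main 4x loop: per the formula above, (Ii+Xi) is multiplied by
	# H^4 and the three following blocks by H^3..H, while the
	# two-phase reduction of the running total is interleaved in
	# between to keep the pclmulqdq issue ports busy.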
	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps	$Xm,$Xmn
	movdqu	0x30($inp),$Xl
	pshufb	$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps	$Xln,$Xi
	movdqu	0x20($inp),$Xln
	movdqa	$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd	\$0b01001110,$Xl,$Xm
	xorps	$Xhn,$Xhi
	pxor	$Xl,$Xm
	pshufb	$T3,$Xln
	movups	0x20($Htbl),$HK
	xorps	$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd	\$0b01001110,$Xln,$Xmn

	pxor	$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa	$Xln,$Xhn
	pxor	$Xhi,$T1		#
	pxor	$Xln,$Xmn
	movdqa	$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq	\$8,$T1
	psrldq	\$8,$T2			#
	pxor	$T1,$Xi
	movdqa	.L7_mask(%rip),$T1
	pxor	$T2,$Xhi		#
	movq	%rax,$T2

	pand	$Xi,$T1			# 1st phase
	pshufb	$T1,$T2			#
	pxor	$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq	\$57,$T2		#
	movdqa	$T2,$T1			#
	pslldq	\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq	\$8,$T1			#
	pxor	$T2,$Xi
	pxor	$T1,$Xhi		#
	movdqu	0($inp),$T1

	movdqa	$Xi,$T2			# 2nd phase
	psrlq	\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps	$Xl,$Xln
	movdqu	0x10($inp),$Xl
	pshufb	$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps	$Xh,$Xhn
	movups	0x50($Htbl),$HK
	pshufb	$T3,$T1
	pxor	$T2,$Xhi		#
	pxor	$Xi,$T2
	psrlq	\$5,$Xi

	movdqa	$Xl,$Xh
	pxor	$Xm,$Xmn
	pshufd	\$0b01001110,$Xl,$Xm
	pxor	$T2,$Xi			#
	pxor	$T1,$Xhi
	pxor	$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq	\$1,$Xi			#
	pxor	$Xhi,$Xi		#
	movdqa	$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps	$Xl,$Xln
	pshufd	\$0b01001110,$Xi,$T1
	pxor	$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps	$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps	$Xm,$Xmn
	xorps	$Xln,$Xi
	xorps	$Xhn,$Xhi
	pxor	$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor	$Xmn,$T1

	pxor	$Xhi,$T1		#
	pxor	$Xi,$Xhi

	movdqa	$T1,$T2			#
	psrldq	\$8,$T1
	pslldq	\$8,$T2			#
	pxor	$T1,$Xhi
	pxor	$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
#	[(H*Ii+1) + (H*Xi+1)] mod P =
#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
	movdqu	($inp),$T1		# Ii
	movdqu	16($inp),$Xln		# Ii+1
	pshufb	$T3,$T1
	pshufb	$T3,$Xln
	pxor	$T1,$Xi			# Ii+Xi

	movdqa	$Xln,$Xhn
	pshufd	\$0b01001110,$Xln,$Xmn
	pxor	$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea	32($inp),$inp		# i+=2
	nop
	sub	\$0x20,$len
	jbe	.Leven_tail
	nop
	jmp	.Lmod_loop

.align	32
.Lmod_loop:
	movdqa	$Xi,$Xhi
	movdqa	$Xmn,$T1
	pshufd	\$0b01001110,$Xi,$Xmn	#
	pxor	$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor	$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor	$Xhn,$Xhi
	movdqu	($inp),$T2		# Ii
	pxor	$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb	$T3,$T2
	movdqu	16($inp),$Xln		# Ii+1

	pxor	$Xhi,$T1
	pxor	$T2,$Xhi		# "Ii+Xi", consume early
	pxor	$T1,$Xmn
	pshufb	$T3,$Xln
	movdqa	$Xmn,$T1		#
	psrldq	\$8,$T1
	pslldq	\$8,$Xmn		#
	pxor	$T1,$Xhi
	pxor	$Xmn,$Xi		#

	movdqa	$Xln,$Xhn		#

	movdqa	$Xi,$T2			# 1st phase
	movdqa	$Xi,$T1
	psllq	\$5,$Xi
	pxor	$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq	\$1,$Xi
	pxor	$T1,$Xi			#
	psllq	\$57,$Xi		#
	movdqa	$Xi,$T1			#
	pslldq	\$8,$Xi
	psrldq	\$8,$T1			#
	pxor	$T2,$Xi
	pshufd	\$0b01001110,$Xhn,$Xmn
	pxor	$T1,$Xhi		#
	pxor	$Xhn,$Xmn		#

	movdqa	$Xi,$T2			# 2nd phase
	psrlq	\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor	$T2,$Xhi		#
	pxor	$Xi,$T2
	psrlq	\$5,$Xi
	pxor	$T2,$Xi			#
	lea	32($inp),$inp
	psrlq	\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor	$Xhi,$Xi		#

	sub	\$0x20,$len
	ja	.Lmod_loop

.Leven_tail:
	movdqa	$Xi,$Xhi
	movdqa	$Xmn,$T1
	pshufd	\$0b01001110,$Xi,$Xmn	#
	pxor	$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor	$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor	$Xhn,$Xhi
	pxor	$Xi,$T1
	pxor	$Xhi,$T1
	pxor	$T1,$Xmn
	movdqa	$Xmn,$T1		#
	psrldq	\$8,$T1
	pslldq	\$8,$Xmn		#
	pxor	$T1,$Xhi
	pxor	$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test	$len,$len
	jnz	.Ldone

.Lodd_tail:
	movdqu	($inp),$T1		# Ii
	pshufb	$T3,$T1
	pxor	$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb	$T3,$Xi
	movdqu	$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu	($Xip),$Hkey
	vpshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd	\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq	\$63,$Hkey,$T1
	vpsllq	\$1,$Hkey,$Hkey
	vpxor	$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3	# broadcast carry bit
	vpslldq	\$8,$T1,$T1
	vpor	$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand	.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor	$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa	$Hkey,$Xi
	vpxor	$Hkey,$HK,$HK
	mov	\$4,%r10		# up to H^8
	jmp	.Linit_start_avx
___

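# Both clmul64x64_T2 above and clmul64x64_avx below rely on the
# carry-less Karatsuba identity: for X = x1:x0 and H = h1:h0,
#   X*H = x1*h1<<128 ^ x0*h0 ^ ((x1^x0)*(h1^h0) ^ x1*h1 ^ x0*h0)<<64
# i.e. three pclmulqdq instead of four, at the cost of a few extra xors.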
sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor	$Xi,$T1,$T1		#
	vpxor	$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor	$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor	$Xi,$Xhi,$T2		#
	vpxor	$T2,$T1,$T1		#

	vpslldq	\$8,$T1,$T2		#
	vpsrldq	\$8,$T1,$T1
	vpxor	$T2,$Xi,$Xi		#
	vpxor	$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq	\$57,$Xi,$T1		# 1st phase
	vpsllq	\$62,$Xi,$T2
	vpxor	$T1,$T2,$T2		#
	vpsllq	\$63,$Xi,$T1
	vpxor	$T1,$T2,$T2		#
	vpslldq	\$8,$T2,$T1		#
	vpsrldq	\$8,$T2,$T2
	vpxor	$T1,$Xi,$Xi		#
	vpxor	$T2,$Xhi,$Xhi

	vpsrlq	\$1,$Xi,$T2		# 2nd phase
	vpxor	$Xi,$Xhi,$Xhi
	vpxor	$T2,$Xi,$Xi		#
	vpsrlq	\$5,$T2,$T2
	vpxor	$T2,$Xi,$Xi		#
	vpsrlq	\$1,$Xi,$Xi		#
	vpxor	$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu	$T3,-0x10($Htbl)		# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa	$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd	\$0b01001110,$T3,$T1
	vpshufd	\$0b01001110,$Xi,$T2
	vpxor	$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu	$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor	$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu	$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea	0x30($Htbl),$Htbl
	sub	\$1,%r10
	jnz	.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu	$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu	($Xip),$Xi		# load $Xi
	lea	.L0x1c2_polynomial(%rip),%r10
	lea	0x40($Htbl),$Htbl	# size optimization
	vmovdqu	.Lbswap_mask(%rip),$bswap
	vpshufb	$bswap,$Xi,$Xi
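	# with at least 0x80 bytes queued up, take the 8x-aggregated
	# main path below; otherwise fall through to the block-at-a-time
	# short path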
	cmp	\$0x80,$len
	jb	.Lshort_avx
	sub	\$0x80,$len

	vmovdqu	0x70($inp),$Ii		# I[7]
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb	$bswap,$Ii,$Ii
	vmovdqu	0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Ii,$T2,$T2
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor	$Ii,$T2,$T2
	vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0x50-0x40($Htbl),$HK

	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Zhi,$Xhi,$Xhi
	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor	$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0x80-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor	$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Zhi,$Xhi,$Xhi
	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor	$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0xb0-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	($inp),$Ij		# I[0]
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea	0x80($inp),$inp
	cmp	\$0x80,$len
	jb	.Ltail_avx

	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi
	sub	\$0x80,$len
	jmp	.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor	$Xlo,$Zlo,$Zlo
	vpxor	$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu	0x20-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Zlo,$Xi,$Xi		# collect result
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps	$Zhi,$Xo,$Xo
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor	$Zmi,$Tred,$Tred
	vxorps	$Ij,$T1,$T1

	vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor	$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Xo,$Tred,$Tred
	vpslldq	\$8,$Tred,$T2
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq	\$8,$Tred,$Tred
	vpxor	$T2, $Xi, $Xi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb	$bswap,$Ii,$Ii
	vxorps	$Tred,$Xo, $Xo
	vpxor	$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0x50-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps	$Ij,$T1,$T1
	vpxor	$Zmi,$Xmi,$Xmi

	vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0x80-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor	$Ij,$T1,$T1
	vpxor	$Zmi,$Xmi,$Xmi
	vxorps	$Tred,$Xi,$Xi

	vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps	$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0xb0-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor	$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi

	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jnc	.Loop8x_avx

	add	\$0x80,$len
	jmp	.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu	-0x10($inp,$len),$Ii	# very last word
	lea	($inp,$len),$inp
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu	0x20-0x40($Htbl),$HK
	vpshufb	$bswap,$Ii,$Ij

	vmovdqa	$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa	$Xhi,$Zhi		# $Zhi and
	vmovdqa	$Xmi,$Zmi		# $Zmi
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x20($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x30($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu	0x50-0x40($Htbl),$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x40($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x50($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu	0x80-0x40($Htbl),$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x60($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x70($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq	0xb8-0x40($Htbl),$HK
	sub	\$0x10,$len
	jmp	.Ltail_avx

.align	32
.Ltail_avx:
	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu	(%r10),$Tred

	vpxor	$Xlo,$Zlo,$Xi
	vpxor	$Xhi,$Zhi,$Xo
	vpxor	$Xmi,$Zmi,$Zmi

	vpxor	$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor	$Xo, $Zmi,$Zmi
	vpslldq	\$8, $Zmi,$T2
	vpsrldq	\$8, $Zmi,$Zmi
	vpxor	$T2, $Xi, $Xi
	vpxor	$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor	$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor	$Xo,$Xi,$Xi
	vpxor	$T2,$Xi,$Xi

	cmp	\$0,$len
	jne	.Lshort_avx

	vpshufb	$bswap,$Xi,$Xi
	vmovdqu	$Xi,($Xip)
	vzeroupper
___
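# Note that the tail above reduces with two vpclmulqdq folds against
# .L0x1c2_polynomial (the approach suggested by Shay Gueron, see the
# March 2013 note at the top) rather than with reduction_avx.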
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	48+280(%rax),%rax	# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps	0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps	0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps	0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps	0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps	0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps	0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps	0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps	0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;