#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components, in
# this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycles per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell, and
# 0.63 on Skylake... [The quoted results are raw profiled measurements
# for a favourable packet size, one divisible by 96. Applications using
# the EVP interface will observe a few percent worse performance.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
# be computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# The upstream code uses the condition |$avx>1| even though no AVX2
# instructions are used, because it assumes MOVBE is supported by the assembler
# if and only if AVX2 is also supported by the assembler; see
# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# See the comment above regarding why the condition is ($avx>1) when there are
# no AVX2 instructions being used.
if ($avx>1) {{{

($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
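	# Core stitched loop: each iteration encrypts six counter blocks
	# while folding the six ciphertext blocks of the previous iteration
	# into the GHASH state. On entry, $T1 holds the current counter
	# block, $Xi the current hash value, and the 96 bytes of ciphertext
	# to be hashed sit, byte-reversed, in $Z3 and at 0x30..0x70(%rsp).
	# The indentation below roughly groups the interleaved streams: no
	# extra space for the GHASH multiplications, one for GHASH
	# accumulation and loads, two for the AES rounds.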
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
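	# $counter caches the counter block's last dword; it was loaded
	# with a little-endian mov, so the big-endian counter's low byte
	# sits in bits 24-31. Adding 6<<24 bumps that byte by 6; a carry
	# out of bit 31 means one of the next six increments would wrap
	# it, making the byte-wise vpaddb increments unsafe, so we take
	# .Lhandle_ctr32 and redo them as full dword additions.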
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2

	# At this point, the current block of 96 (0x60) bytes has already been
	# loaded into registers. Concurrently with processing it, we want to
	# load the next 96 bytes of input for the next round. Obviously, we can
	# only do this if there are at least 96 more bytes of input beyond the
	# input we're currently processing, or else we'd read past the end of
	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
	# bytes of input beyond the 96 bytes we're already processing, and we
	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
	# we'll read in the next block so that it is in registers for the next
	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
	# the current block and then ignore what we re-read.
	#
	# At this point, |$in0| points to the current (already read into
	# registers) block, and |$end0| points to 2*96 bytes before the end of
	# the input. Thus, |$in0| > |$end0| means that we do not have the next
	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
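	#
	# The cmp/setnc/neg/and sequence interleaved below is a branchless
	# rendering of (a C sketch):
	#
	#	r12 = (in0 <= end0) ? 0x60 : 0;
	#	in0 += r12;
	#
	# setnc captures the unsigned comparison, neg widens the 0/1 result
	# into an all-zeroes/all-ones mask, and the and reduces that mask
	# to 0 or 0x60.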
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
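	# Slow path for a wrapping low counter byte: byte-swap the counter
	# block so the 32-bit counter occupies an ordinary dword, do real
	# dword additions (.Lone_lsb, .Ltwo_lsb), then swap each block back
	# into GCM byte order and apply the round-0 pre-whitening.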
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
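	# Final AES round for all six blocks. Each vpxor below folds the
	# last round key ($T1) into 16 bytes of input, so that vaesenclast
	# performs the final AddRoundKey and the CTR XOR with the input in
	# a single step.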
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3,$inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
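#
# Both functions return the number of bytes processed, which may be 0 if
# |len| is below the entry minimum; any unprocessed tail is left to the
# caller. A caller might drive the bulk path roughly like this (a C
# sketch; only the prototype above is taken from this file):
#
#	size_t done = aesni_gcm_encrypt(in, out, len, key, ivec, Xip);
#	in += done;
#	out += done;
#	len -= done;
#	/* Encrypt and hash the remaining |len| bytes with the generic
#	 * one-block CTR and GHASH routines. */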
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
	xor	$ret,$ret

	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
	# bytes of input.
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

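	# The next six instructions keep the stack scratch area away from
	# the page offset of the expanded key schedule: the mask 0xf80
	# extracts bits 7-11 of each address, i.e. the offset within a 4KB
	# page modulo the 128-byte alignment established above. If the
	# stack's offset is at least the key's but less than 768 bytes
	# above it, %rsp is lowered by the difference, presumably so that
	# key and stack loads do not contend via 4K aliasing.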
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
	lea		-0xc0($inp,$len),$end0

	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

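	# The stitched loop stores output one iteration behind, so the
	# final six plaintext blocks come back still in $inout0-5 and are
	# flushed here.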
	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
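	# Plain 6-block CTR encryption with no GHASH work. aesni_gcm_encrypt
	# calls this twice up front so that two iterations' worth of
	# ciphertext exists before the stitched loop starts hashing; in GCM
	# encryption GHASH consumes ciphertext, so hashing necessarily runs
	# behind encryption.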
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
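	# Counter-wrap slow path, mirroring .Lhandle_ctr32 above: swap to
	# dword order, add .Lone_lsb/.Ltwo_lsb, swap back, pre-whiten.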
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
	xor	$ret,$ret

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

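	# Same page-offset anti-aliasing adjustment as in aesni_gcm_decrypt
	# above.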
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea		($out),$in0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
	# the decryption case, there's no caveat that |$out| must not be near
	# the very beginning of the address space, because we know that
	# |$len| >= 3*96 from the check above, and so we know
	# |$out| + |$len| >= 2*96 (0xc0).
	lea		-0xc0($out,$len),$end0

	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	vmovdqu		($Xip),$Xi		# load Xi
	lea		0x20+0x20($Xip),$Xip	# size optimization
	sub		\$12,$len
	mov		\$0x60*2,$ret
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
{ my ($HK,$T3)=($rndkey,$inout0);
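# What follows is the final GHASH pass over the last twelve ciphertext
# blocks: six saved on the stack by the loop and six still in the $inout
# registers. Each block is multiplied by the matching power of H
# ($Hkey^1..$Hkey^6; the $HK vectors hold the xors of each power's
# halves, packed two per vector, for the Karatsuba middle terms), the
# partial products are accumulated, and the result is reduced with the
# .Lpoly constant.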

$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Xip),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

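	# Two-phase reduction of the 256-bit product ($Z3:$Xi) back to 128
	# bits: each phase rotates $Xi by 64 bits (vpalignr) and folds in
	# $Xi multiplied by the .Lpoly constant; the deferred high half
	# ($Z3) enters in the second phase.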
	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
___
}
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

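# Constant pool. Each .byte line below is one 16-byte vector, least
# significant byte first:
#
#   .Lbswap_mask - vpshufb mask that reverses a lane's byte order
#   .Lpoly       - the GHASH reduction constant: 0xc2 in the top byte,
#                  i.e. the reflected form of x^128+x^7+x^2+x+1
#   .Lone_msb    - 1 in the top byte, for vpaddb increments of a counter
#                  block kept in raw (big-endian) byte order
#   .Ltwo_lsb, .Lone_lsb - little-endian 2 and 1, for the vpaddd
#                  increments on the byte-swapped counter slow path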
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;