      1 #! /usr/bin/env perl
      2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 ######################################################################
     11 ## Constant-time SSSE3 AES core implementation.
     12 ## version 0.1
     13 ##
     14 ## By Mike Hamburg (Stanford University), 2009
     15 ## Public domain.
     16 ##
     17 ## For details see http://shiftleft.org/papers/vector_aes/ and
     18 ## http://crypto.stanford.edu/vpaes/.
     19 
     20 ######################################################################
     21 # September 2011.
     22 #
      23 # Interface to OpenSSL as an "almost" drop-in replacement for
      24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
      25 # doesn't handle partial vectors (it doesn't have to when called from
      26 # EVP only). "Drop-in" implies that this module neither shares the key
      27 # schedule structure with the original nor makes assumptions about its
      28 # alignment...
     29 #
      30 # Performance summary. The aes-x86_64.pl column lists large-block CBC
      31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
      32 # byte processed with a 128-bit key, and the vpaes-x86_64.pl column
      33 # lists [also large-block CBC] encrypt/decrypt results.
     34 #
     35 #		aes-x86_64.pl		vpaes-x86_64.pl
     36 #
     37 # Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
     38 # Nehalem	29.6/40.3/14.6		10.0/11.8
     39 # Atom		57.3/74.2/32.1		60.9/77.2(***)
     40 # Silvermont	52.7/64.0/19.5		48.8/60.8(***)
     41 # Goldmont	38.9/49.0/17.8		10.6/12.6
     42 #
      43 # (*)	"Hyper-threading" in this context refers to cache shared among
      44 #	multiple cores rather than specifically to Intel HTT. As the vast
      45 #	majority of contemporary cores share cache, the slower code path
      46 #	is commonplace. In other words, "with-hyper-threading-off"
      47 #	results are presented mostly for reference purposes.
     48 #
     49 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
     50 #
      51 # (***)	The less impressive improvement on Core 2 and Atom is due to slow
      52 #	pshufb, yet it is still a respectable +36%/62% improvement on Core 2
      53 #	(as implied, over the "hyper-threading-safe" code path).
     54 #
      55 #						<appro@openssl.org>
     56 
     57 $flavour = shift;
     58 $output  = shift;
     59 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     60 
     61 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     62 
     63 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     64 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     65 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     66 die "can't locate x86_64-xlate.pl";
     67 
     68 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     69 *STDOUT=*OUT;
     70 
     71 $PREFIX="vpaes";
     72 
     73 $code.=<<___;
     74 .text
     75 
     76 ##
      77 ##  _vpaes_encrypt_core
     78 ##
     79 ##  AES-encrypt %xmm0.
     80 ##
     81 ##  Inputs:
     82 ##     %xmm0 = input
     83 ##     %xmm9-%xmm15 as in _vpaes_preheat
     84 ##    (%rdx) = scheduled keys
     85 ##
     86 ##  Output in %xmm0
     87 ##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
     88 ##  Preserves %xmm6 - %xmm8 so you get some local vectors
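         ##
         ##  Flow, for reference: the input is basis-changed via .Lk_ipt and xored with
         ##  the round-0 key, .Lenc_loop/.Lenc_entry perform the middle rounds, and the
         ##  final round uses the .Lk_sbo tables followed by the .Lk_sr row shuffle.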
     89 ##
     90 ##
     91 .type	_vpaes_encrypt_core,\@abi-omnipotent
     92 .align 16
     93 _vpaes_encrypt_core:
     94 .cfi_startproc
     95 	mov	%rdx,	%r9
     96 	mov	\$16,	%r11
     97 	mov	240(%rdx),%eax
     98 	movdqa	%xmm9,	%xmm1
     99 	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
    100 	pandn	%xmm0,	%xmm1
    101 	movdqu	(%r9),	%xmm5		# round0 key
    102 	psrld	\$4,	%xmm1
    103 	pand	%xmm9,	%xmm0
    104 	pshufb	%xmm0,	%xmm2
    105 	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
    106 	pshufb	%xmm1,	%xmm0
    107 	pxor	%xmm5,	%xmm2
    108 	add	\$16,	%r9
    109 	pxor	%xmm2,	%xmm0
    110 	lea	.Lk_mc_backward(%rip),%r10
    111 	jmp	.Lenc_entry
    112 
    113 .align 16
    114 .Lenc_loop:
    115 	# middle of middle round
    116 	movdqa  %xmm13,	%xmm4	# 4 : sb1u
    117 	movdqa  %xmm12,	%xmm0	# 0 : sb1t
    118 	pshufb  %xmm2,	%xmm4	# 4 = sb1u
    119 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
    120 	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
    121 	movdqa  %xmm15,	%xmm5	# 4 : sb2u
    122 	pxor	%xmm4,	%xmm0	# 0 = A
    123 	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
    124 	pshufb	%xmm2,	%xmm5	# 4 = sb2u
    125 	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
    126 	movdqa	%xmm14, %xmm2	# 2 : sb2t
    127 	pshufb	%xmm3,  %xmm2	# 2 = sb2t
    128 	movdqa	%xmm0,  %xmm3	# 3 = A
    129 	pxor	%xmm5,	%xmm2	# 2 = 2A
    130 	pshufb  %xmm1,  %xmm0	# 0 = B
    131 	add	\$16,	%r9	# next key
    132 	pxor	%xmm2,  %xmm0	# 0 = 2A+B
    133 	pshufb	%xmm4,	%xmm3	# 3 = D
    134 	add	\$16,	%r11	# next mc
    135 	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
    136 	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
    137 	and	\$0x30,	%r11	# ... mod 4
    138 	sub	\$1,%rax	# nr--
    139 	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
    140 
    141 .Lenc_entry:
    142 	# top of round
    143 	movdqa  %xmm9, 	%xmm1	# 1 : i
    144 	movdqa	%xmm11, %xmm5	# 2 : a/k
    145 	pandn	%xmm0, 	%xmm1	# 1 = i<<4
    146 	psrld	\$4,   	%xmm1   # 1 = i
    147 	pand	%xmm9, 	%xmm0   # 0 = k
    148 	pshufb  %xmm0,  %xmm5	# 2 = a/k
    149 	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
    150 	pxor	%xmm1,	%xmm0	# 0 = j
    151 	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
    152 	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
    153 	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
    154 	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
    155 	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
    156 	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
    157 	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
    158 	movdqa	%xmm10, %xmm3   # 3 : 1/jak
    159 	pxor	%xmm0, 	%xmm2  	# 2 = io
    160 	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
    161 	movdqu	(%r9),	%xmm5
    162 	pxor	%xmm1,  %xmm3   # 3 = jo
    163 	jnz	.Lenc_loop
    164 
    165 	# middle of last round
    166 	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
    167 	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
    168 	pshufb  %xmm2,  %xmm4	# 4 = sbou
    169 	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
    170 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
    171 	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
    172 	pxor	%xmm4,	%xmm0	# 0 = A
    173 	pshufb	%xmm1,	%xmm0
    174 	ret
    175 .cfi_endproc
    176 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
    177 
    178 ##
     179 ##  _vpaes_encrypt_core_2x
    180 ##
    181 ##  AES-encrypt %xmm0 and %xmm6 in parallel.
    182 ##
    183 ##  Inputs:
    184 ##     %xmm0 and %xmm6 = input
     185 ##     %xmm9 and %xmm10 as in _vpaes_preheat
    186 ##    (%rdx) = scheduled keys
    187 ##
    188 ##  Output in %xmm0 and %xmm6
     189 ##  Clobbers  %xmm1-%xmm5, %xmm7, %xmm8, %xmm11-%xmm13, %r9, %r10, %r11, %rax
    190 ##  Preserves %xmm14 and %xmm15
    191 ##
    192 ##  This function stitches two parallel instances of _vpaes_encrypt_core. x86_64
    193 ##  provides 16 XMM registers. _vpaes_encrypt_core computes over six registers
    194 ##  (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants
    195 ##  from _vpaes_preheat (%xmm9-%xmm15). This does not quite fit two instances,
    196 ##  so we spill some of %xmm9 through %xmm15 back to memory. We keep %xmm9 and
    197 ##  %xmm10 in registers as these values are used several times in a row. The
    198 ##  remainder are read once per round and are spilled to memory. This leaves two
    199 ##  registers preserved for the caller.
    200 ##
    201 ##  Thus, of the two _vpaes_encrypt_core instances, the first uses (%xmm0-%xmm5)
    202 ##  as before. The second uses %xmm6-%xmm8,%xmm11-%xmm13. (Add 6 to %xmm2 and
    203 ##  below. Add 8 to %xmm3 and up.) Instructions in the second instance are
    204 ##  indented by one space.
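         ##
         ##  For reference, the register correspondence between the two instances is:
         ##     first instance:  %xmm0 %xmm1 %xmm2 %xmm3  %xmm4  %xmm5
         ##     second instance: %xmm6 %xmm7 %xmm8 %xmm11 %xmm12 %xmm13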
    205 ##
    206 ##
    207 .type	_vpaes_encrypt_core_2x,\@abi-omnipotent
    208 .align 16
    209 _vpaes_encrypt_core_2x:
    210 .cfi_startproc
    211 	mov	%rdx,	%r9
    212 	mov	\$16,	%r11
    213 	mov	240(%rdx),%eax
    214 	movdqa	%xmm9,	%xmm1
    215 	 movdqa	%xmm9,	%xmm7
    216 	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
    217 	 movdqa	%xmm2,	%xmm8
    218 	pandn	%xmm0,	%xmm1
    219 	 pandn	%xmm6,	%xmm7
    220 	movdqu	(%r9),	%xmm5		# round0 key
    221 	 # Also use %xmm5 in the second instance.
    222 	psrld	\$4,	%xmm1
    223 	 psrld	\$4,	%xmm7
    224 	pand	%xmm9,	%xmm0
    225 	 pand	%xmm9,	%xmm6
    226 	pshufb	%xmm0,	%xmm2
    227 	 pshufb	%xmm6,	%xmm8
    228 	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
    229 	 movdqa	%xmm0,	%xmm6
    230 	pshufb	%xmm1,	%xmm0
    231 	 pshufb	%xmm7,	%xmm6
    232 	pxor	%xmm5,	%xmm2
    233 	 pxor	%xmm5,	%xmm8
    234 	add	\$16,	%r9
    235 	pxor	%xmm2,	%xmm0
    236 	 pxor	%xmm8,	%xmm6
    237 	lea	.Lk_mc_backward(%rip),%r10
    238 	jmp	.Lenc2x_entry
    239 
    240 .align 16
    241 .Lenc2x_loop:
    242 	# middle of middle round
    243 	movdqa  .Lk_sb1(%rip),	%xmm4		# 4 : sb1u
    244 	movdqa  .Lk_sb1+16(%rip),%xmm0		# 0 : sb1t
    245 	 movdqa	%xmm4,	%xmm12
    246 	 movdqa	%xmm0,	%xmm6
    247 	pshufb  %xmm2,	%xmm4			# 4 = sb1u
    248 	 pshufb	%xmm8,	%xmm12
    249 	pshufb  %xmm3,	%xmm0			# 0 = sb1t
    250 	 pshufb	%xmm11,	%xmm6
    251 	pxor	%xmm5,	%xmm4			# 4 = sb1u + k
    252 	 pxor	%xmm5,	%xmm12
    253 	movdqa  .Lk_sb2(%rip),	%xmm5		# 4 : sb2u
    254 	 movdqa	%xmm5,	%xmm13
    255 	pxor	%xmm4,	%xmm0			# 0 = A
    256 	 pxor	%xmm12,	%xmm6
    257 	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
    258 	 # Also use %xmm1 in the second instance.
    259 	pshufb	%xmm2,	%xmm5			# 4 = sb2u
    260 	 pshufb	%xmm8,	%xmm13
    261 	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
    262 	 # Also use %xmm4 in the second instance.
    263 	movdqa	.Lk_sb2+16(%rip), %xmm2		# 2 : sb2t
    264 	 movdqa	%xmm2,	%xmm8
    265 	pshufb	%xmm3,  %xmm2			# 2 = sb2t
    266 	 pshufb	%xmm11,	%xmm8
    267 	movdqa	%xmm0,  %xmm3			# 3 = A
    268 	 movdqa	%xmm6,	%xmm11
    269 	pxor	%xmm5,	%xmm2			# 2 = 2A
    270 	 pxor	%xmm13,	%xmm8
    271 	pshufb  %xmm1,  %xmm0			# 0 = B
    272 	 pshufb	%xmm1,	%xmm6
    273 	add	\$16,	%r9			# next key
    274 	pxor	%xmm2,  %xmm0			# 0 = 2A+B
    275 	 pxor	%xmm8,	%xmm6
    276 	pshufb	%xmm4,	%xmm3			# 3 = D
    277 	 pshufb	%xmm4,	%xmm11
    278 	add	\$16,	%r11			# next mc
    279 	pxor	%xmm0,	%xmm3			# 3 = 2A+B+D
    280 	 pxor	%xmm6,	%xmm11
    281 	pshufb  %xmm1,	%xmm0			# 0 = 2B+C
    282 	 pshufb	%xmm1,	%xmm6
    283 	and	\$0x30,	%r11			# ... mod 4
    284 	sub	\$1,%rax			# nr--
    285 	pxor	%xmm3,	%xmm0			# 0 = 2A+3B+C+D
    286 	 pxor	%xmm11,	%xmm6
    287 
    288 .Lenc2x_entry:
    289 	# top of round
    290 	movdqa  %xmm9, 	%xmm1	# 1 : i
    291 	 movdqa	%xmm9,	%xmm7
    292 	movdqa	.Lk_inv+16(%rip), %xmm5	# 2 : a/k
    293 	 movdqa	%xmm5,	%xmm13
    294 	pandn	%xmm0, 	%xmm1	# 1 = i<<4
    295 	 pandn	%xmm6,	%xmm7
    296 	psrld	\$4,   	%xmm1   # 1 = i
    297 	 psrld	\$4,	%xmm7
    298 	pand	%xmm9, 	%xmm0   # 0 = k
    299 	 pand	%xmm9,	%xmm6
    300 	pshufb  %xmm0,  %xmm5	# 2 = a/k
    301 	 pshufb	%xmm6,	%xmm13
    302 	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
    303 	 movdqa	%xmm10,	%xmm11
    304 	pxor	%xmm1,	%xmm0	# 0 = j
    305 	 pxor	%xmm7,	%xmm6
    306 	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
    307 	 pshufb	%xmm7,	%xmm11
    308 	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
    309 	 movdqa	%xmm10,	%xmm12
    310 	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
    311 	 pxor	%xmm13,	%xmm11
    312 	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
    313 	 pshufb	%xmm6,	%xmm12
    314 	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
    315 	 movdqa	%xmm10,	%xmm8
    316 	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
    317 	 pxor	%xmm13,	%xmm12
    318 	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
    319 	 pshufb	%xmm11,	%xmm8
    320 	movdqa	%xmm10, %xmm3   # 3 : 1/jak
    321 	 movdqa	%xmm10,	%xmm11
    322 	pxor	%xmm0, 	%xmm2  	# 2 = io
    323 	 pxor	%xmm6,	%xmm8
    324 	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
    325 	 pshufb	%xmm12,	%xmm11
    326 	movdqu	(%r9),	%xmm5
    327 	 # Also use %xmm5 in the second instance.
    328 	pxor	%xmm1,  %xmm3   # 3 = jo
    329 	 pxor	%xmm7,	%xmm11
    330 	jnz	.Lenc2x_loop
    331 
    332 	# middle of last round
    333 	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
    334 	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
    335 	 movdqa	%xmm4,	%xmm12
    336 	 movdqa	%xmm0,	%xmm6
    337 	pshufb  %xmm2,  %xmm4	# 4 = sbou
    338 	 pshufb	%xmm8,	%xmm12
    339 	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
    340 	 pxor	%xmm5,	%xmm12
    341 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
    342 	 pshufb	%xmm11,	%xmm6
    343 	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
    344 	 # Also use %xmm1 in the second instance.
    345 	pxor	%xmm4,	%xmm0	# 0 = A
    346 	 pxor	%xmm12,	%xmm6
    347 	pshufb	%xmm1,	%xmm0
    348 	 pshufb	%xmm1,	%xmm6
    349 	ret
    350 .cfi_endproc
    351 .size	_vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
    352 
    353 ##
    354 ##  Decryption core
    355 ##
    356 ##  Same API as encryption core.
    357 ##
    358 .type	_vpaes_decrypt_core,\@abi-omnipotent
    359 .align	16
    360 _vpaes_decrypt_core:
    361 .cfi_startproc
    362 	mov	%rdx,	%r9		# load key
    363 	mov	240(%rdx),%eax
    364 	movdqa	%xmm9,	%xmm1
    365 	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
    366 	pandn	%xmm0,	%xmm1
    367 	mov	%rax,	%r11
    368 	psrld	\$4,	%xmm1
    369 	movdqu	(%r9),	%xmm5		# round0 key
    370 	shl	\$4,	%r11
    371 	pand	%xmm9,	%xmm0
    372 	pshufb	%xmm0,	%xmm2
    373 	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
    374 	xor	\$0x30,	%r11
    375 	lea	.Lk_dsbd(%rip),%r10
    376 	pshufb	%xmm1,	%xmm0
    377 	and	\$0x30,	%r11
    378 	pxor	%xmm5,	%xmm2
    379 	movdqa	.Lk_mc_forward+48(%rip), %xmm5
    380 	pxor	%xmm2,	%xmm0
    381 	add	\$16,	%r9
    382 	add	%r10,	%r11
    383 	jmp	.Ldec_entry
    384 
    385 .align 16
    386 .Ldec_loop:
    387 ##
    388 ##  Inverse mix columns
    389 ##
    390 	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
    391 	movdqa  -0x10(%r10),%xmm1	# 0 : sb9t
    392 	pshufb	%xmm2,	%xmm4		# 4 = sb9u
    393 	pshufb	%xmm3,	%xmm1		# 0 = sb9t
    394 	pxor	%xmm4,	%xmm0
    395 	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
    396 	pxor	%xmm1,	%xmm0		# 0 = ch
    397 	movdqa  0x10(%r10),%xmm1	# 0 : sbdt
    398 
    399 	pshufb	%xmm2,	%xmm4		# 4 = sbdu
    400 	pshufb	%xmm5,	%xmm0		# MC ch
    401 	pshufb	%xmm3,	%xmm1		# 0 = sbdt
    402 	pxor	%xmm4,	%xmm0		# 4 = ch
    403 	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
    404 	pxor	%xmm1,	%xmm0		# 0 = ch
    405 	movdqa  0x30(%r10),%xmm1	# 0 : sbbt
    406 
    407 	pshufb	%xmm2,	%xmm4		# 4 = sbbu
    408 	pshufb	%xmm5,	%xmm0		# MC ch
    409 	pshufb	%xmm3,	%xmm1		# 0 = sbbt
    410 	pxor	%xmm4,	%xmm0		# 4 = ch
    411 	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
    412 	pxor	%xmm1,	%xmm0		# 0 = ch
    413 	movdqa  0x50(%r10),%xmm1	# 0 : sbet
    414 
    415 	pshufb	%xmm2,	%xmm4		# 4 = sbeu
    416 	pshufb	%xmm5,	%xmm0		# MC ch
    417 	pshufb	%xmm3,	%xmm1		# 0 = sbet
    418 	pxor	%xmm4,	%xmm0		# 4 = ch
    419 	add	\$16, %r9		# next round key
    420 	palignr	\$12,	%xmm5,	%xmm5
    421 	pxor	%xmm1,	%xmm0		# 0 = ch
    422 	sub	\$1,%rax		# nr--
    423 
    424 .Ldec_entry:
    425 	# top of round
    426 	movdqa  %xmm9, 	%xmm1	# 1 : i
    427 	pandn	%xmm0, 	%xmm1	# 1 = i<<4
    428 	movdqa	%xmm11, %xmm2	# 2 : a/k
    429 	psrld	\$4,    %xmm1	# 1 = i
    430 	pand	%xmm9, 	%xmm0	# 0 = k
    431 	pshufb  %xmm0,  %xmm2	# 2 = a/k
    432 	movdqa	%xmm10,	%xmm3	# 3 : 1/i
    433 	pxor	%xmm1,	%xmm0	# 0 = j
    434 	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
    435 	movdqa	%xmm10,	%xmm4	# 4 : 1/j
    436 	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
    437 	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
    438 	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
    439 	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
    440 	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
    441 	movdqa	%xmm10, %xmm3	# 3 : 1/jak
    442 	pxor	%xmm0, 	%xmm2	# 2 = io
    443 	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
    444 	movdqu	(%r9),	%xmm0
    445 	pxor	%xmm1,  %xmm3	# 3 = jo
    446 	jnz	.Ldec_loop
    447 
    448 	# middle of last round
    449 	movdqa	0x60(%r10), %xmm4	# 3 : sbou
    450 	pshufb  %xmm2,  %xmm4	# 4 = sbou
    451 	pxor	%xmm0,  %xmm4	# 4 = sb1u + k
    452 	movdqa	0x70(%r10), %xmm0	# 0 : sbot
    453 	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
    454 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
    455 	pxor	%xmm4,	%xmm0	# 0 = A
    456 	pshufb	%xmm2,	%xmm0
    457 	ret
    458 .cfi_endproc
    459 .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
    460 
    461 ########################################################
    462 ##                                                    ##
    463 ##                  AES key schedule                  ##
    464 ##                                                    ##
    465 ########################################################
    466 .type	_vpaes_schedule_core,\@abi-omnipotent
    467 .align	16
    468 _vpaes_schedule_core:
    469 .cfi_startproc
    470 	# rdi = key
    471 	# rsi = size in bits
    472 	# rdx = buffer
    473 	# rcx = direction.  0=encrypt, 1=decrypt
    474 
    475 	call	_vpaes_preheat		# load the tables
    476 	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
    477 	movdqu	(%rdi),	%xmm0		# load key (unaligned)
    478 
    479 	# input transform
    480 	movdqa	%xmm0,	%xmm3
    481 	lea	.Lk_ipt(%rip), %r11
    482 	call	_vpaes_schedule_transform
    483 	movdqa	%xmm0,	%xmm7
    484 
    485 	lea	.Lk_sr(%rip),%r10
    486 	test	%rcx,	%rcx
    487 	jnz	.Lschedule_am_decrypting
    488 
    489 	# encrypting, output zeroth round key after transform
    490 	movdqu	%xmm0,	(%rdx)
    491 	jmp	.Lschedule_go
    492 
    493 .Lschedule_am_decrypting:
    494 	# decrypting, output zeroth round key after shiftrows
    495 	movdqa	(%r8,%r10),%xmm1
    496 	pshufb  %xmm1,	%xmm3
    497 	movdqu	%xmm3,	(%rdx)
    498 	xor	\$0x30, %r8
    499 
    500 .Lschedule_go:
    501 	cmp	\$192,	%esi
    502 	ja	.Lschedule_256
    503 	je	.Lschedule_192
     504 	# 128: fall through
    505 
    506 ##
    507 ##  .schedule_128
    508 ##
    509 ##  128-bit specific part of key schedule.
    510 ##
    511 ##  This schedule is really simple, because all its parts
    512 ##  are accomplished by the subroutines.
    513 ##
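         ##  (With %esi set to 10 below, the loop produces round keys 1-9 here and the
         ##  10th via .Lschedule_mangle_last; together with the zeroth key written
         ##  above, that is the 11 round keys AES-128 needs.)
         ##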
    514 .Lschedule_128:
    515 	mov	\$10, %esi
    516 
    517 .Loop_schedule_128:
    518 	call 	_vpaes_schedule_round
    519 	dec	%rsi
    520 	jz 	.Lschedule_mangle_last
    521 	call	_vpaes_schedule_mangle	# write output
    522 	jmp 	.Loop_schedule_128
    523 
    524 ##
    525 ##  .aes_schedule_192
    526 ##
    527 ##  192-bit specific part of key schedule.
    528 ##
    529 ##  The main body of this schedule is the same as the 128-bit
    530 ##  schedule, but with more smearing.  The long, high side is
    531 ##  stored in %xmm7 as before, and the short, low side is in
    532 ##  the high bits of %xmm6.
    533 ##
    534 ##  This schedule is somewhat nastier, however, because each
    535 ##  round produces 192 bits of key material, or 1.5 round keys.
    536 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
    537 ##  keys.
    538 ##
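         ##  (With %esi set to 4 below, the four cycles yield the remaining 12 round
         ##  keys, the last via .Lschedule_mangle_last; together with the zeroth key
         ##  written above, that is the 13 round keys AES-192 needs.)
         ##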
    539 .align	16
    540 .Lschedule_192:
    541 	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
    542 	call	_vpaes_schedule_transform	# input transform
    543 	movdqa	%xmm0,	%xmm6		# save short part
    544 	pxor	%xmm4,	%xmm4		# clear 4
    545 	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
    546 	mov	\$4,	%esi
    547 
    548 .Loop_schedule_192:
    549 	call	_vpaes_schedule_round
    550 	palignr	\$8,%xmm6,%xmm0
    551 	call	_vpaes_schedule_mangle	# save key n
    552 	call	_vpaes_schedule_192_smear
    553 	call	_vpaes_schedule_mangle	# save key n+1
    554 	call	_vpaes_schedule_round
    555 	dec	%rsi
    556 	jz 	.Lschedule_mangle_last
    557 	call	_vpaes_schedule_mangle	# save key n+2
    558 	call	_vpaes_schedule_192_smear
    559 	jmp	.Loop_schedule_192
    560 
    561 ##
    562 ##  .aes_schedule_256
    563 ##
    564 ##  256-bit specific part of key schedule.
    565 ##
    566 ##  The structure here is very similar to the 128-bit
    567 ##  schedule, but with an additional "low side" in
    568 ##  %xmm6.  The low side's rounds are the same as the
    569 ##  high side's, except no rcon and no rotation.
    570 ##
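         ##  (With %esi set to 7 below, each cycle writes two round keys, the very last
         ##  via .Lschedule_mangle_last; together with the zeroth key written above,
         ##  that is the 15 round keys AES-256 needs.)
         ##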
    571 .align	16
    572 .Lschedule_256:
    573 	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
    574 	call	_vpaes_schedule_transform	# input transform
    575 	mov	\$7, %esi
    576 
    577 .Loop_schedule_256:
    578 	call	_vpaes_schedule_mangle	# output low result
    579 	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
    580 
    581 	# high round
    582 	call	_vpaes_schedule_round
    583 	dec	%rsi
    584 	jz 	.Lschedule_mangle_last
    585 	call	_vpaes_schedule_mangle
    586 
    587 	# low round. swap xmm7 and xmm6
    588 	pshufd	\$0xFF,	%xmm0,	%xmm0
    589 	movdqa	%xmm7,	%xmm5
    590 	movdqa	%xmm6,	%xmm7
    591 	call	_vpaes_schedule_low_round
    592 	movdqa	%xmm5,	%xmm7
    593 
    594 	jmp	.Loop_schedule_256
    595 
    596 
    597 ##
    598 ##  .aes_schedule_mangle_last
    599 ##
    600 ##  Mangler for last round of key schedule
    601 ##  Mangles %xmm0
    602 ##    when encrypting, outputs out(%xmm0) ^ 63
    603 ##    when decrypting, outputs unskew(%xmm0)
    604 ##
    605 ##  Always called right before return... jumps to cleanup and exits
    606 ##
    607 .align	16
    608 .Lschedule_mangle_last:
    609 	# schedule last round key from xmm0
    610 	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
    611 	test	%rcx, 	%rcx
    612 	jnz	.Lschedule_mangle_last_dec
    613 
    614 	# encrypting
    615 	movdqa	(%r8,%r10),%xmm1
    616 	pshufb	%xmm1,	%xmm0		# output permute
    617 	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
    618 	add	\$32,	%rdx
    619 
    620 .Lschedule_mangle_last_dec:
    621 	add	\$-16,	%rdx
    622 	pxor	.Lk_s63(%rip),	%xmm0
    623 	call	_vpaes_schedule_transform # output transform
    624 	movdqu	%xmm0,	(%rdx)		# save last key
    625 
    626 	# cleanup
    627 	pxor	%xmm0,  %xmm0
    628 	pxor	%xmm1,  %xmm1
    629 	pxor	%xmm2,  %xmm2
    630 	pxor	%xmm3,  %xmm3
    631 	pxor	%xmm4,  %xmm4
    632 	pxor	%xmm5,  %xmm5
    633 	pxor	%xmm6,  %xmm6
    634 	pxor	%xmm7,  %xmm7
    635 	ret
    636 .cfi_endproc
    637 .size	_vpaes_schedule_core,.-_vpaes_schedule_core
    638 
    639 ##
    640 ##  .aes_schedule_192_smear
    641 ##
    642 ##  Smear the short, low side in the 192-bit key schedule.
    643 ##
    644 ##  Inputs:
    645 ##    %xmm7: high side, b  a  x  y
    646 ##    %xmm6:  low side, d  c  0  0
    647 ##    %xmm13: 0
    648 ##
    649 ##  Outputs:
    650 ##    %xmm6: b+c+d  b+c  0  0
    651 ##    %xmm0: b+c+d  b+c  b  a
    652 ##
    653 .type	_vpaes_schedule_192_smear,\@abi-omnipotent
    654 .align	16
    655 _vpaes_schedule_192_smear:
    656 .cfi_startproc
    657 	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
    658 	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
    659 	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
    660 	pxor	%xmm1,	%xmm1
    661 	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
    662 	movdqa	%xmm6,	%xmm0
    663 	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
    664 	ret
    665 .cfi_endproc
    666 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
    667 
    668 ##
    669 ##  .aes_schedule_round
    670 ##
    671 ##  Runs one main round of the key schedule on %xmm0, %xmm7
    672 ##
    673 ##  Specifically, runs subbytes on the high dword of %xmm0
    674 ##  then rotates it by one byte and xors into the low dword of
    675 ##  %xmm7.
    676 ##
    677 ##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
    678 ##  next rcon.
    679 ##
    680 ##  Smears the dwords of %xmm7 by xoring the low into the
    681 ##  second low, result into third, result into highest.
    682 ##
    683 ##  Returns results in %xmm7 = %xmm0.
    684 ##  Clobbers %xmm1-%xmm4, %r11.
    685 ##
    686 .type	_vpaes_schedule_round,\@abi-omnipotent
    687 .align	16
    688 _vpaes_schedule_round:
    689 .cfi_startproc
    690 	# extract rcon from xmm8
    691 	pxor	%xmm1,	%xmm1
    692 	palignr	\$15,	%xmm8,	%xmm1
    693 	palignr	\$15,	%xmm8,	%xmm8
    694 	pxor	%xmm1,	%xmm7
    695 
    696 	# rotate
    697 	pshufd	\$0xFF,	%xmm0,	%xmm0
    698 	palignr	\$1,	%xmm0,	%xmm0
    699 
    700 	# fall through...
    701 
    702 	# low round: same as high round, but no rotation and no rcon.
    703 _vpaes_schedule_low_round:
    704 	# smear xmm7
    705 	movdqa	%xmm7,	%xmm1
    706 	pslldq	\$4,	%xmm7
    707 	pxor	%xmm1,	%xmm7
    708 	movdqa	%xmm7,	%xmm1
    709 	pslldq	\$8,	%xmm7
    710 	pxor	%xmm1,	%xmm7
    711 	pxor	.Lk_s63(%rip), %xmm7
    712 
    713 	# subbytes
    714 	movdqa  %xmm9, 	%xmm1
    715 	pandn	%xmm0, 	%xmm1
    716 	psrld	\$4,    %xmm1		# 1 = i
    717 	pand	%xmm9, 	%xmm0		# 0 = k
    718 	movdqa	%xmm11, %xmm2		# 2 : a/k
    719 	pshufb  %xmm0,  %xmm2		# 2 = a/k
    720 	pxor	%xmm1,	%xmm0		# 0 = j
    721 	movdqa	%xmm10,	%xmm3		# 3 : 1/i
    722 	pshufb  %xmm1, 	%xmm3		# 3 = 1/i
    723 	pxor	%xmm2, 	%xmm3		# 3 = iak = 1/i + a/k
    724 	movdqa	%xmm10,	%xmm4		# 4 : 1/j
    725 	pshufb	%xmm0, 	%xmm4		# 4 = 1/j
    726 	pxor	%xmm2, 	%xmm4		# 4 = jak = 1/j + a/k
    727 	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
    728 	pshufb  %xmm3,	%xmm2		# 2 = 1/iak
    729 	pxor	%xmm0, 	%xmm2		# 2 = io
    730 	movdqa	%xmm10, %xmm3		# 3 : 1/jak
    731 	pshufb  %xmm4,  %xmm3		# 3 = 1/jak
    732 	pxor	%xmm1,  %xmm3		# 3 = jo
    733 	movdqa	%xmm13, %xmm4		# 4 : sbou
    734 	pshufb  %xmm2,  %xmm4		# 4 = sbou
    735 	movdqa	%xmm12, %xmm0		# 0 : sbot
    736 	pshufb  %xmm3,	%xmm0		# 0 = sb1t
    737 	pxor	%xmm4, 	%xmm0		# 0 = sbox output
    738 
    739 	# add in smeared stuff
    740 	pxor	%xmm7,	%xmm0
    741 	movdqa	%xmm0,	%xmm7
    742 	ret
    743 .cfi_endproc
    744 .size	_vpaes_schedule_round,.-_vpaes_schedule_round
    745 
    746 ##
    747 ##  .aes_schedule_transform
    748 ##
    749 ##  Linear-transform %xmm0 according to tables at (%r11)
    750 ##
    751 ##  Requires that %xmm9 = 0x0F0F... as in preheat
    752 ##  Output in %xmm0
    753 ##  Clobbers %xmm1, %xmm2
    754 ##
    755 .type	_vpaes_schedule_transform,\@abi-omnipotent
    756 .align	16
    757 _vpaes_schedule_transform:
    758 .cfi_startproc
    759 	movdqa	%xmm9,	%xmm1
    760 	pandn	%xmm0,	%xmm1
    761 	psrld	\$4,	%xmm1
    762 	pand	%xmm9,	%xmm0
    763 	movdqa	(%r11), %xmm2 	# lo
    764 	pshufb	%xmm0,	%xmm2
    765 	movdqa	16(%r11), %xmm0 # hi
    766 	pshufb	%xmm1,	%xmm0
    767 	pxor	%xmm2,	%xmm0
    768 	ret
    769 .cfi_endproc
    770 .size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
    771 
    772 ##
    773 ##  .aes_schedule_mangle
    774 ##
    775 ##  Mangle xmm0 from (basis-transformed) standard version
    776 ##  to our version.
    777 ##
    778 ##  On encrypt,
    779 ##    xor with 0x63
    780 ##    multiply by circulant 0,1,1,1
    781 ##    apply shiftrows transform
    782 ##
    783 ##  On decrypt,
    784 ##    xor with 0x63
    785 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
    786 ##    deskew
    787 ##    apply shiftrows transform
    788 ##
    789 ##
    790 ##  Writes out to (%rdx), and increments or decrements it
    791 ##  Keeps track of round number mod 4 in %r8
    792 ##  Preserves xmm0
    793 ##  Clobbers xmm1-xmm5
    794 ##
    795 .type	_vpaes_schedule_mangle,\@abi-omnipotent
    796 .align	16
    797 _vpaes_schedule_mangle:
    798 .cfi_startproc
    799 	movdqa	%xmm0,	%xmm4	# save xmm0 for later
    800 	movdqa	.Lk_mc_forward(%rip),%xmm5
    801 	test	%rcx, 	%rcx
    802 	jnz	.Lschedule_mangle_dec
    803 
    804 	# encrypting
    805 	add	\$16,	%rdx
    806 	pxor	.Lk_s63(%rip),%xmm4
    807 	pshufb	%xmm5,	%xmm4
    808 	movdqa	%xmm4,	%xmm3
    809 	pshufb	%xmm5,	%xmm4
    810 	pxor	%xmm4,	%xmm3
    811 	pshufb	%xmm5,	%xmm4
    812 	pxor	%xmm4,	%xmm3
    813 
    814 	jmp	.Lschedule_mangle_both
    815 .align	16
    816 .Lschedule_mangle_dec:
    817 	# inverse mix columns
    818 	lea	.Lk_dksd(%rip),%r11
    819 	movdqa	%xmm9,	%xmm1
    820 	pandn	%xmm4,	%xmm1
    821 	psrld	\$4,	%xmm1	# 1 = hi
    822 	pand	%xmm9,	%xmm4	# 4 = lo
    823 
    824 	movdqa	0x00(%r11), %xmm2
    825 	pshufb	%xmm4,	%xmm2
    826 	movdqa	0x10(%r11), %xmm3
    827 	pshufb	%xmm1,	%xmm3
    828 	pxor	%xmm2,	%xmm3
    829 	pshufb	%xmm5,	%xmm3
    830 
    831 	movdqa	0x20(%r11), %xmm2
    832 	pshufb	%xmm4,	%xmm2
    833 	pxor	%xmm3,	%xmm2
    834 	movdqa	0x30(%r11), %xmm3
    835 	pshufb	%xmm1,	%xmm3
    836 	pxor	%xmm2,	%xmm3
    837 	pshufb	%xmm5,	%xmm3
    838 
    839 	movdqa	0x40(%r11), %xmm2
    840 	pshufb	%xmm4,	%xmm2
    841 	pxor	%xmm3,	%xmm2
    842 	movdqa	0x50(%r11), %xmm3
    843 	pshufb	%xmm1,	%xmm3
    844 	pxor	%xmm2,	%xmm3
    845 	pshufb	%xmm5,	%xmm3
    846 
    847 	movdqa	0x60(%r11), %xmm2
    848 	pshufb	%xmm4,	%xmm2
    849 	pxor	%xmm3,	%xmm2
    850 	movdqa	0x70(%r11), %xmm3
    851 	pshufb	%xmm1,	%xmm3
    852 	pxor	%xmm2,	%xmm3
    853 
    854 	add	\$-16,	%rdx
    855 
    856 .Lschedule_mangle_both:
    857 	movdqa	(%r8,%r10),%xmm1
    858 	pshufb	%xmm1,%xmm3
    859 	add	\$-16,	%r8
    860 	and	\$0x30,	%r8
    861 	movdqu	%xmm3,	(%rdx)
    862 	ret
    863 .cfi_endproc
    864 .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
    865 
    866 #
    867 # Interface to OpenSSL
    868 #
    869 .globl	${PREFIX}_set_encrypt_key
    870 .type	${PREFIX}_set_encrypt_key,\@function,3
    871 .align	16
    872 ${PREFIX}_set_encrypt_key:
    873 .cfi_startproc
    874 #ifndef NDEBUG
    875 #ifndef BORINGSSL_FIPS
    876 .extern        BORINGSSL_function_hit
    877        movb \$1, BORINGSSL_function_hit+5(%rip)
    878 #endif
    879 #endif
    880 
    881 ___
    882 $code.=<<___ if ($win64);
    883 	lea	-0xb8(%rsp),%rsp
    884 	movaps	%xmm6,0x10(%rsp)
    885 	movaps	%xmm7,0x20(%rsp)
    886 	movaps	%xmm8,0x30(%rsp)
    887 	movaps	%xmm9,0x40(%rsp)
    888 	movaps	%xmm10,0x50(%rsp)
    889 	movaps	%xmm11,0x60(%rsp)
    890 	movaps	%xmm12,0x70(%rsp)
    891 	movaps	%xmm13,0x80(%rsp)
    892 	movaps	%xmm14,0x90(%rsp)
    893 	movaps	%xmm15,0xa0(%rsp)
    894 .Lenc_key_body:
    895 ___
    896 $code.=<<___;
    897 	mov	%esi,%eax
    898 	shr	\$5,%eax
    899 	add	\$5,%eax
    900 	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
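         	# (9/11/13 for 128/192/256-bit keys: the number of .Lenc_loop or
         	# .Ldec_loop iterations, with the final round handled after the loop.)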
    901 
    902 	mov	\$0,%ecx
    903 	mov	\$0x30,%r8d
    904 	call	_vpaes_schedule_core
    905 ___
    906 $code.=<<___ if ($win64);
    907 	movaps	0x10(%rsp),%xmm6
    908 	movaps	0x20(%rsp),%xmm7
    909 	movaps	0x30(%rsp),%xmm8
    910 	movaps	0x40(%rsp),%xmm9
    911 	movaps	0x50(%rsp),%xmm10
    912 	movaps	0x60(%rsp),%xmm11
    913 	movaps	0x70(%rsp),%xmm12
    914 	movaps	0x80(%rsp),%xmm13
    915 	movaps	0x90(%rsp),%xmm14
    916 	movaps	0xa0(%rsp),%xmm15
    917 	lea	0xb8(%rsp),%rsp
    918 .Lenc_key_epilogue:
    919 ___
    920 $code.=<<___;
    921 	xor	%eax,%eax
    922 	ret
    923 .cfi_endproc
    924 .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
    925 
    926 .globl	${PREFIX}_set_decrypt_key
    927 .type	${PREFIX}_set_decrypt_key,\@function,3
    928 .align	16
    929 ${PREFIX}_set_decrypt_key:
    930 .cfi_startproc
    931 ___
    932 $code.=<<___ if ($win64);
    933 	lea	-0xb8(%rsp),%rsp
    934 	movaps	%xmm6,0x10(%rsp)
    935 	movaps	%xmm7,0x20(%rsp)
    936 	movaps	%xmm8,0x30(%rsp)
    937 	movaps	%xmm9,0x40(%rsp)
    938 	movaps	%xmm10,0x50(%rsp)
    939 	movaps	%xmm11,0x60(%rsp)
    940 	movaps	%xmm12,0x70(%rsp)
    941 	movaps	%xmm13,0x80(%rsp)
    942 	movaps	%xmm14,0x90(%rsp)
    943 	movaps	%xmm15,0xa0(%rsp)
    944 .Ldec_key_body:
    945 ___
    946 $code.=<<___;
    947 	mov	%esi,%eax
    948 	shr	\$5,%eax
    949 	add	\$5,%eax
    950 	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
    951 	shl	\$4,%eax
    952 	lea	16(%rdx,%rax),%rdx
    953 
    954 	mov	\$1,%ecx
    955 	mov	%esi,%r8d
    956 	shr	\$1,%r8d
    957 	and	\$32,%r8d
    958 	xor	\$32,%r8d	# nbits==192?0:32
    959 	call	_vpaes_schedule_core
    960 ___
    961 $code.=<<___ if ($win64);
    962 	movaps	0x10(%rsp),%xmm6
    963 	movaps	0x20(%rsp),%xmm7
    964 	movaps	0x30(%rsp),%xmm8
    965 	movaps	0x40(%rsp),%xmm9
    966 	movaps	0x50(%rsp),%xmm10
    967 	movaps	0x60(%rsp),%xmm11
    968 	movaps	0x70(%rsp),%xmm12
    969 	movaps	0x80(%rsp),%xmm13
    970 	movaps	0x90(%rsp),%xmm14
    971 	movaps	0xa0(%rsp),%xmm15
    972 	lea	0xb8(%rsp),%rsp
    973 .Ldec_key_epilogue:
    974 ___
    975 $code.=<<___;
    976 	xor	%eax,%eax
    977 	ret
    978 .cfi_endproc
    979 .size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
    980 
    981 .globl	${PREFIX}_encrypt
    982 .type	${PREFIX}_encrypt,\@function,3
    983 .align	16
    984 ${PREFIX}_encrypt:
    985 .cfi_startproc
    986 #ifndef NDEBUG
    987 #ifndef BORINGSSL_FIPS
    988 .extern        BORINGSSL_function_hit
    989        movb \$1, BORINGSSL_function_hit+4(%rip)
    990 #endif
    991 #endif
    992 ___
    993 $code.=<<___ if ($win64);
    994 	lea	-0xb8(%rsp),%rsp
    995 	movaps	%xmm6,0x10(%rsp)
    996 	movaps	%xmm7,0x20(%rsp)
    997 	movaps	%xmm8,0x30(%rsp)
    998 	movaps	%xmm9,0x40(%rsp)
    999 	movaps	%xmm10,0x50(%rsp)
   1000 	movaps	%xmm11,0x60(%rsp)
   1001 	movaps	%xmm12,0x70(%rsp)
   1002 	movaps	%xmm13,0x80(%rsp)
   1003 	movaps	%xmm14,0x90(%rsp)
   1004 	movaps	%xmm15,0xa0(%rsp)
   1005 .Lenc_body:
   1006 ___
   1007 $code.=<<___;
   1008 	movdqu	(%rdi),%xmm0
   1009 	call	_vpaes_preheat
   1010 	call	_vpaes_encrypt_core
   1011 	movdqu	%xmm0,(%rsi)
   1012 ___
   1013 $code.=<<___ if ($win64);
   1014 	movaps	0x10(%rsp),%xmm6
   1015 	movaps	0x20(%rsp),%xmm7
   1016 	movaps	0x30(%rsp),%xmm8
   1017 	movaps	0x40(%rsp),%xmm9
   1018 	movaps	0x50(%rsp),%xmm10
   1019 	movaps	0x60(%rsp),%xmm11
   1020 	movaps	0x70(%rsp),%xmm12
   1021 	movaps	0x80(%rsp),%xmm13
   1022 	movaps	0x90(%rsp),%xmm14
   1023 	movaps	0xa0(%rsp),%xmm15
   1024 	lea	0xb8(%rsp),%rsp
   1025 .Lenc_epilogue:
   1026 ___
   1027 $code.=<<___;
   1028 	ret
   1029 .cfi_endproc
   1030 .size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
   1031 
   1032 .globl	${PREFIX}_decrypt
   1033 .type	${PREFIX}_decrypt,\@function,3
   1034 .align	16
   1035 ${PREFIX}_decrypt:
   1036 .cfi_startproc
   1037 ___
   1038 $code.=<<___ if ($win64);
   1039 	lea	-0xb8(%rsp),%rsp
   1040 	movaps	%xmm6,0x10(%rsp)
   1041 	movaps	%xmm7,0x20(%rsp)
   1042 	movaps	%xmm8,0x30(%rsp)
   1043 	movaps	%xmm9,0x40(%rsp)
   1044 	movaps	%xmm10,0x50(%rsp)
   1045 	movaps	%xmm11,0x60(%rsp)
   1046 	movaps	%xmm12,0x70(%rsp)
   1047 	movaps	%xmm13,0x80(%rsp)
   1048 	movaps	%xmm14,0x90(%rsp)
   1049 	movaps	%xmm15,0xa0(%rsp)
   1050 .Ldec_body:
   1051 ___
   1052 $code.=<<___;
   1053 	movdqu	(%rdi),%xmm0
   1054 	call	_vpaes_preheat
   1055 	call	_vpaes_decrypt_core
   1056 	movdqu	%xmm0,(%rsi)
   1057 ___
   1058 $code.=<<___ if ($win64);
   1059 	movaps	0x10(%rsp),%xmm6
   1060 	movaps	0x20(%rsp),%xmm7
   1061 	movaps	0x30(%rsp),%xmm8
   1062 	movaps	0x40(%rsp),%xmm9
   1063 	movaps	0x50(%rsp),%xmm10
   1064 	movaps	0x60(%rsp),%xmm11
   1065 	movaps	0x70(%rsp),%xmm12
   1066 	movaps	0x80(%rsp),%xmm13
   1067 	movaps	0x90(%rsp),%xmm14
   1068 	movaps	0xa0(%rsp),%xmm15
   1069 	lea	0xb8(%rsp),%rsp
   1070 .Ldec_epilogue:
   1071 ___
   1072 $code.=<<___;
   1073 	ret
   1074 .cfi_endproc
   1075 .size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
   1076 ___
   1077 {
   1078 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
    1079 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
   1080 #                       size_t length, const AES_KEY *key,
   1081 #                       unsigned char *ivp,const int enc);
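         # A rough C usage sketch, assuming the conventional prototypes and AES_KEY
         # declaration provided by the caller (length must be a multiple of 16, since
         # partial blocks are not handled; see the header comment above):
         #
         #   AES_KEY ks;
         #   vpaes_set_encrypt_key(user_key, 128, &ks);
         #   vpaes_cbc_encrypt(in, out, len, &ks, iv, 1 /* enc */);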
   1082 $code.=<<___;
   1083 .globl	${PREFIX}_cbc_encrypt
   1084 .type	${PREFIX}_cbc_encrypt,\@function,6
   1085 .align	16
   1086 ${PREFIX}_cbc_encrypt:
   1087 .cfi_startproc
   1088 	xchg	$key,$len
   1089 ___
   1090 ($len,$key)=($key,$len);
   1091 $code.=<<___;
   1092 	sub	\$16,$len
   1093 	jc	.Lcbc_abort
   1094 ___
   1095 $code.=<<___ if ($win64);
   1096 	lea	-0xb8(%rsp),%rsp
   1097 	movaps	%xmm6,0x10(%rsp)
   1098 	movaps	%xmm7,0x20(%rsp)
   1099 	movaps	%xmm8,0x30(%rsp)
   1100 	movaps	%xmm9,0x40(%rsp)
   1101 	movaps	%xmm10,0x50(%rsp)
   1102 	movaps	%xmm11,0x60(%rsp)
   1103 	movaps	%xmm12,0x70(%rsp)
   1104 	movaps	%xmm13,0x80(%rsp)
   1105 	movaps	%xmm14,0x90(%rsp)
   1106 	movaps	%xmm15,0xa0(%rsp)
   1107 .Lcbc_body:
   1108 ___
   1109 $code.=<<___;
   1110 	movdqu	($ivp),%xmm6		# load IV
   1111 	sub	$inp,$out
   1112 	call	_vpaes_preheat
   1113 	cmp	\$0,${enc}d
   1114 	je	.Lcbc_dec_loop
   1115 	jmp	.Lcbc_enc_loop
   1116 .align	16
   1117 .Lcbc_enc_loop:
   1118 	movdqu	($inp),%xmm0
   1119 	pxor	%xmm6,%xmm0
   1120 	call	_vpaes_encrypt_core
   1121 	movdqa	%xmm0,%xmm6
   1122 	movdqu	%xmm0,($out,$inp)
   1123 	lea	16($inp),$inp
   1124 	sub	\$16,$len
   1125 	jnc	.Lcbc_enc_loop
   1126 	jmp	.Lcbc_done
   1127 .align	16
   1128 .Lcbc_dec_loop:
   1129 	movdqu	($inp),%xmm0
   1130 	movdqa	%xmm0,%xmm7
   1131 	call	_vpaes_decrypt_core
   1132 	pxor	%xmm6,%xmm0
   1133 	movdqa	%xmm7,%xmm6
   1134 	movdqu	%xmm0,($out,$inp)
   1135 	lea	16($inp),$inp
   1136 	sub	\$16,$len
   1137 	jnc	.Lcbc_dec_loop
   1138 .Lcbc_done:
   1139 	movdqu	%xmm6,($ivp)		# save IV
   1140 ___
   1141 $code.=<<___ if ($win64);
   1142 	movaps	0x10(%rsp),%xmm6
   1143 	movaps	0x20(%rsp),%xmm7
   1144 	movaps	0x30(%rsp),%xmm8
   1145 	movaps	0x40(%rsp),%xmm9
   1146 	movaps	0x50(%rsp),%xmm10
   1147 	movaps	0x60(%rsp),%xmm11
   1148 	movaps	0x70(%rsp),%xmm12
   1149 	movaps	0x80(%rsp),%xmm13
   1150 	movaps	0x90(%rsp),%xmm14
   1151 	movaps	0xa0(%rsp),%xmm15
   1152 	lea	0xb8(%rsp),%rsp
   1153 .Lcbc_epilogue:
   1154 ___
   1155 $code.=<<___;
   1156 .Lcbc_abort:
   1157 	ret
   1158 .cfi_endproc
   1159 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
   1160 ___
   1161 }
   1162 {
   1163 my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
   1164 # void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out,
   1165 #                                 size_t blocks, const AES_KEY *key,
   1166 #                                 const uint8_t ivp[16]);
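         # The last four bytes of ivp carry a 32-bit big-endian block counter, which is
         # incremented once per 16-byte block (see .Lrev_ctr/.Lctr_add_* below).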
   1167 $code.=<<___;
   1168 .globl	${PREFIX}_ctr32_encrypt_blocks
   1169 .type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
   1170 .align	16
   1171 ${PREFIX}_ctr32_encrypt_blocks:
   1172 .cfi_startproc
   1173 	# _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx.
   1174 	xchg	$key, $blocks
   1175 ___
   1176 ($blocks,$key)=($key,$blocks);
   1177 $code.=<<___;
   1178 	test	$blocks, $blocks
   1179 	jz	.Lctr32_abort
   1180 ___
   1181 $code.=<<___ if ($win64);
   1182 	lea	-0xb8(%rsp),%rsp
   1183 	movaps	%xmm6,0x10(%rsp)
   1184 	movaps	%xmm7,0x20(%rsp)
   1185 	movaps	%xmm8,0x30(%rsp)
   1186 	movaps	%xmm9,0x40(%rsp)
   1187 	movaps	%xmm10,0x50(%rsp)
   1188 	movaps	%xmm11,0x60(%rsp)
   1189 	movaps	%xmm12,0x70(%rsp)
   1190 	movaps	%xmm13,0x80(%rsp)
   1191 	movaps	%xmm14,0x90(%rsp)
   1192 	movaps	%xmm15,0xa0(%rsp)
   1193 .Lctr32_body:
   1194 ___
   1195 $code.=<<___;
   1196 	movdqu	($ivp), %xmm0		# Load IV.
   1197 	movdqa	.Lctr_add_one(%rip), %xmm8
   1198 	sub	$inp, $out		# This allows only incrementing $inp.
   1199 	call	_vpaes_preheat
   1200 	movdqa	%xmm0, %xmm6
   1201 	pshufb	.Lrev_ctr(%rip), %xmm6
   1202 
   1203 	test	\$1, $blocks
   1204 	jz	.Lctr32_prep_loop
   1205 
   1206 	# Handle one block so the remaining block count is even for
   1207 	# _vpaes_encrypt_core_2x.
   1208 	movdqu	($inp), %xmm7		# Load input.
   1209 	call	_vpaes_encrypt_core
   1210 	pxor	%xmm7, %xmm0
   1211 	paddd	%xmm8, %xmm6
   1212 	movdqu	%xmm0, ($out,$inp)
   1213 	sub	\$1, $blocks
   1214 	lea	16($inp), $inp
   1215 	jz	.Lctr32_done
   1216 
   1217 .Lctr32_prep_loop:
   1218 	# _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare
   1219 	# registers. We maintain two byte-swapped counters in them.
   1220 	movdqa	%xmm6, %xmm14
   1221 	movdqa	%xmm6, %xmm15
   1222 	paddd	%xmm8, %xmm15
   1223 
   1224 .Lctr32_loop:
   1225 	movdqa	.Lrev_ctr(%rip), %xmm1	# Set up counters.
   1226 	movdqa	%xmm14, %xmm0
   1227 	movdqa	%xmm15, %xmm6
   1228 	pshufb	%xmm1, %xmm0
   1229 	pshufb	%xmm1, %xmm6
   1230 	call	_vpaes_encrypt_core_2x
   1231 	movdqu	($inp), %xmm1		# Load input.
   1232 	movdqu	16($inp), %xmm2
   1233 	movdqa	.Lctr_add_two(%rip), %xmm3
   1234 	pxor	%xmm1, %xmm0		# XOR input.
   1235 	pxor	%xmm2, %xmm6
   1236 	paddd	%xmm3, %xmm14		# Increment counters.
   1237 	paddd	%xmm3, %xmm15
   1238 	movdqu	%xmm0, ($out,$inp)	# Write output.
   1239 	movdqu	%xmm6, 16($out,$inp)
   1240 	sub	\$2, $blocks		# Advance loop.
   1241 	lea	32($inp), $inp
   1242 	jnz	.Lctr32_loop
   1243 
   1244 .Lctr32_done:
   1245 ___
   1246 $code.=<<___ if ($win64);
   1247 	movaps	0x10(%rsp),%xmm6
   1248 	movaps	0x20(%rsp),%xmm7
   1249 	movaps	0x30(%rsp),%xmm8
   1250 	movaps	0x40(%rsp),%xmm9
   1251 	movaps	0x50(%rsp),%xmm10
   1252 	movaps	0x60(%rsp),%xmm11
   1253 	movaps	0x70(%rsp),%xmm12
   1254 	movaps	0x80(%rsp),%xmm13
   1255 	movaps	0x90(%rsp),%xmm14
   1256 	movaps	0xa0(%rsp),%xmm15
   1257 	lea	0xb8(%rsp),%rsp
   1258 .Lctr32_epilogue:
   1259 ___
   1260 $code.=<<___;
   1261 .Lctr32_abort:
   1262 	ret
   1263 .cfi_endproc
   1264 .size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
   1265 ___
   1266 }
   1267 $code.=<<___;
   1268 ##
    1269 ##  _vpaes_preheat
   1270 ##
    1271 ##  Fills register %r10 with a pointer into the constant table (so the code
    1272 ##  can stay position-independent) and %xmm9-%xmm15 as specified below.
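         ##  Summary: %xmm9 = .Lk_s0F, %xmm10/%xmm11 = .Lk_inv (lo/hi),
         ##  %xmm13/%xmm12 = .Lk_sb1 (lo/hi), %xmm15/%xmm14 = .Lk_sb2 (lo/hi).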
   1273 ##
   1274 .type	_vpaes_preheat,\@abi-omnipotent
   1275 .align	16
   1276 _vpaes_preheat:
   1277 .cfi_startproc
   1278 	lea	.Lk_s0F(%rip), %r10
   1279 	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
   1280 	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
   1281 	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
   1282 	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
   1283 	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
   1284 	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
   1285 	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
   1286 	ret
   1287 .cfi_endproc
   1288 .size	_vpaes_preheat,.-_vpaes_preheat
   1289 ########################################################
   1290 ##                                                    ##
   1291 ##                     Constants                      ##
   1292 ##                                                    ##
   1293 ########################################################
   1294 .type	_vpaes_consts,\@object
   1295 .align	64
   1296 _vpaes_consts:
   1297 .Lk_inv:	# inv, inva
   1298 	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
   1299 	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
   1300 
   1301 .Lk_s0F:	# s0F
   1302 	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
   1303 
   1304 .Lk_ipt:	# input transform (lo, hi)
   1305 	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
   1306 	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
   1307 
   1308 .Lk_sb1:	# sb1u, sb1t
   1309 	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
   1310 	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
   1311 .Lk_sb2:	# sb2u, sb2t
   1312 	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
   1313 	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
   1314 .Lk_sbo:	# sbou, sbot
   1315 	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
   1316 	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
   1317 
   1318 .Lk_mc_forward:	# mc_forward
   1319 	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
   1320 	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
   1321 	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
   1322 	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
   1323 
   1324 .Lk_mc_backward:# mc_backward
   1325 	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
   1326 	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
   1327 	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
   1328 	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
   1329 
   1330 .Lk_sr:		# sr
   1331 	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
   1332 	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
   1333 	.quad	0x0F060D040B020900, 0x070E050C030A0108
   1334 	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
   1335 
   1336 .Lk_rcon:	# rcon
   1337 	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
   1338 
   1339 .Lk_s63:	# s63: all equal to 0x63 transformed
   1340 	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
   1341 
   1342 .Lk_opt:	# output transform
   1343 	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
   1344 	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
   1345 
   1346 .Lk_deskew:	# deskew tables: inverts the sbox's "skew"
   1347 	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
   1348 	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
   1349 
   1350 ##
   1351 ##  Decryption stuff
   1352 ##  Key schedule constants
   1353 ##
   1354 .Lk_dksd:	# decryption key schedule: invskew x*D
   1355 	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
   1356 	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
   1357 .Lk_dksb:	# decryption key schedule: invskew x*B
   1358 	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
   1359 	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
   1360 .Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
   1361 	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
   1362 	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
   1363 .Lk_dks9:	# decryption key schedule: invskew x*9
   1364 	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
   1365 	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
   1366 
   1367 ##
   1368 ##  Decryption stuff
   1369 ##  Round function constants
   1370 ##
   1371 .Lk_dipt:	# decryption input transform
   1372 	.quad	0x0F505B040B545F00, 0x154A411E114E451A
   1373 	.quad	0x86E383E660056500, 0x12771772F491F194
   1374 
   1375 .Lk_dsb9:	# decryption sbox output *9*u, *9*t
   1376 	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
   1377 	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
   1378 .Lk_dsbd:	# decryption sbox output *D*u, *D*t
   1379 	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
   1380 	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
   1381 .Lk_dsbb:	# decryption sbox output *B*u, *B*t
   1382 	.quad	0xD022649296B44200, 0x602646F6B0F2D404
   1383 	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
   1384 .Lk_dsbe:	# decryption sbox output *E*u, *E*t
   1385 	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
   1386 	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
   1387 .Lk_dsbo:	# decryption sbox final output
   1388 	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
   1389 	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
   1390 
   1391 # .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV.
   1392 .Lrev_ctr:
   1393 	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
   1394 # .Lctr_add_* may be added to a byte-swapped xmm register to increment the
   1395 # counter. The register must be byte-swapped again to form the actual input.
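         # For example, pshufb with .Lrev_ctr turns the IV's big-endian counter dword
         # into a little-endian value in the top lane, paddd with .Lctr_add_one (or
         # .Lctr_add_two) increments it there, and a second pshufb with .Lrev_ctr
         # restores the byte order the cipher input expects.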
   1396 .Lctr_add_one:
   1397 	.quad	0x0000000000000000, 0x0000000100000000
   1398 .Lctr_add_two:
   1399 	.quad	0x0000000000000000, 0x0000000200000000
   1400 
   1401 .asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
   1402 .align	64
   1403 .size	_vpaes_consts,.-_vpaes_consts
   1404 ___
   1405 
   1406 if ($win64) {
   1407 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1408 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   1409 $rec="%rcx";
   1410 $frame="%rdx";
   1411 $context="%r8";
   1412 $disp="%r9";
   1413 
   1414 $code.=<<___;
   1415 .extern	__imp_RtlVirtualUnwind
   1416 .type	se_handler,\@abi-omnipotent
   1417 .align	16
   1418 se_handler:
   1419 	push	%rsi
   1420 	push	%rdi
   1421 	push	%rbx
   1422 	push	%rbp
   1423 	push	%r12
   1424 	push	%r13
   1425 	push	%r14
   1426 	push	%r15
   1427 	pushfq
   1428 	sub	\$64,%rsp
   1429 
   1430 	mov	120($context),%rax	# pull context->Rax
   1431 	mov	248($context),%rbx	# pull context->Rip
   1432 
   1433 	mov	8($disp),%rsi		# disp->ImageBase
   1434 	mov	56($disp),%r11		# disp->HandlerData
   1435 
   1436 	mov	0(%r11),%r10d		# HandlerData[0]
   1437 	lea	(%rsi,%r10),%r10	# prologue label
   1438 	cmp	%r10,%rbx		# context->Rip<prologue label
   1439 	jb	.Lin_prologue
   1440 
   1441 	mov	152($context),%rax	# pull context->Rsp
   1442 
   1443 	mov	4(%r11),%r10d		# HandlerData[1]
   1444 	lea	(%rsi,%r10),%r10	# epilogue label
   1445 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1446 	jae	.Lin_prologue
   1447 
   1448 	lea	16(%rax),%rsi		# %xmm save area
   1449 	lea	512($context),%rdi	# &context.Xmm6
   1450 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
   1451 	.long	0xa548f3fc		# cld; rep movsq
   1452 	lea	0xb8(%rax),%rax		# adjust stack pointer
   1453 
   1454 .Lin_prologue:
   1455 	mov	8(%rax),%rdi
   1456 	mov	16(%rax),%rsi
   1457 	mov	%rax,152($context)	# restore context->Rsp
   1458 	mov	%rsi,168($context)	# restore context->Rsi
   1459 	mov	%rdi,176($context)	# restore context->Rdi
   1460 
   1461 	mov	40($disp),%rdi		# disp->ContextRecord
   1462 	mov	$context,%rsi		# context
   1463 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
   1464 	.long	0xa548f3fc		# cld; rep movsq
   1465 
   1466 	mov	$disp,%rsi
   1467 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1468 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1469 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1470 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1471 	mov	40(%rsi),%r10		# disp->ContextRecord
   1472 	lea	56(%rsi),%r11		# &disp->HandlerData
   1473 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1474 	mov	%r10,32(%rsp)		# arg5
   1475 	mov	%r11,40(%rsp)		# arg6
   1476 	mov	%r12,48(%rsp)		# arg7
   1477 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1478 	call	*__imp_RtlVirtualUnwind(%rip)
   1479 
   1480 	mov	\$1,%eax		# ExceptionContinueSearch
   1481 	add	\$64,%rsp
   1482 	popfq
   1483 	pop	%r15
   1484 	pop	%r14
   1485 	pop	%r13
   1486 	pop	%r12
   1487 	pop	%rbp
   1488 	pop	%rbx
   1489 	pop	%rdi
   1490 	pop	%rsi
   1491 	ret
   1492 .size	se_handler,.-se_handler
   1493 
   1494 .section	.pdata
   1495 .align	4
   1496 	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
   1497 	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
   1498 	.rva	.LSEH_info_${PREFIX}_set_encrypt_key
   1499 
   1500 	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
   1501 	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
   1502 	.rva	.LSEH_info_${PREFIX}_set_decrypt_key
   1503 
   1504 	.rva	.LSEH_begin_${PREFIX}_encrypt
   1505 	.rva	.LSEH_end_${PREFIX}_encrypt
   1506 	.rva	.LSEH_info_${PREFIX}_encrypt
   1507 
   1508 	.rva	.LSEH_begin_${PREFIX}_decrypt
   1509 	.rva	.LSEH_end_${PREFIX}_decrypt
   1510 	.rva	.LSEH_info_${PREFIX}_decrypt
   1511 
   1512 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
   1513 	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
   1514 	.rva	.LSEH_info_${PREFIX}_cbc_encrypt
   1515 
   1516 	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
   1517 	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
   1518 	.rva	.LSEH_info_${PREFIX}_ctr32_encrypt_blocks
   1519 
   1520 .section	.xdata
   1521 .align	8
   1522 .LSEH_info_${PREFIX}_set_encrypt_key:
   1523 	.byte	9,0,0,0
   1524 	.rva	se_handler
   1525 	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
   1526 .LSEH_info_${PREFIX}_set_decrypt_key:
   1527 	.byte	9,0,0,0
   1528 	.rva	se_handler
   1529 	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
   1530 .LSEH_info_${PREFIX}_encrypt:
   1531 	.byte	9,0,0,0
   1532 	.rva	se_handler
   1533 	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
   1534 .LSEH_info_${PREFIX}_decrypt:
   1535 	.byte	9,0,0,0
   1536 	.rva	se_handler
   1537 	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
   1538 .LSEH_info_${PREFIX}_cbc_encrypt:
   1539 	.byte	9,0,0,0
   1540 	.rva	se_handler
   1541 	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
   1542 .LSEH_info_${PREFIX}_ctr32_encrypt_blocks:
   1543 	.byte	9,0,0,0
   1544 	.rva	se_handler
   1545 	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
   1546 ___
   1547 }
   1548 
   1549 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   1550 
   1551 print $code;
   1552 
   1553 close STDOUT;
   1554