#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in >5x size reduction,
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   skipping one shiftrows(), reducing the bit-sliced key schedule and
#   speeding up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.88		+11%
# Atom	    	17.1		16.4		+4%
# Silvermont	-		12.9
# Goldmont	-		8.85
#
# (*)	The comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
#	and xor-ing of the input, as in Emilia's CTR implementation,
#	is performed. However, the CTR calculations account for no
#	more than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short
# inputs. Conversion time in CPU cycles and its ratio to the CPU
# cycles spent in the 8x block function is:
#
# 		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.20
#
# The ratio values mean that 128-byte inputs will be processed
# 16-18% slower, 256-byte inputs 9-10% slower, 384-byte inputs 6-7%
# slower, etc. Also keep in mind that input sizes not divisible by
# 128 are *effectively* slower, especially the shortest ones, e.g.
# consecutive 144-byte inputs are processed 44% slower than one
# would expect, 272-byte ones 29% slower, 400-byte ones 22% slower,
# etc. Yet, despite all these "shortcomings" it's still faster than
# the ["hyper-threading-safe" code path in] aes-x86_64.pl on all
# lengths above 64 bytes...
#
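# As a worked example of what these ratios imply (an illustrative
# note derived from the table above, not from the original): with a
# conversion/8x-block ratio r, a single 128-byte call spends r/(1+r)
# of its time in conversion, i.e. 0.22/1.22 = ~18% on Core 2 and
# 0.20/1.20 = ~17% on Nehalem and Atom; a 256-byte call amortizes the
# same conversion over two 8x blocks, roughly halving the relative
# overhead to 9-10%, and so on.
#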
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
# Goldmont	10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
# bytes is suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor 	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor 	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor 	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
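
# A scalar rendition of Mul_GF4's dataflow (an illustrative sketch, not
# part of the generated code; the xmm shares become plain integers):
#
#	$t0 = ($y0 ^ $y1) & $x0;
#	$x0 ^= $x1;
#	$x1 &= $y0;
#	$x0 &= $y1;
#	$x0 ^= $x1;		# x0 output share
#	$x1 ^= $t0;		# x1 output share
#
# i.e. one bitsliced GF(2^2) multiplication in 3 ANDs and 4 XORs.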

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor 	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
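
# Inv_GF256 realizes inversion in GF(2^8) through the tower
# GF(2) < GF(2^2) < GF(2^4) < GF(2^8) in a normal-basis representation
# (see the Käsper-Schwabe paper for the derivation). A minimal scalar
# reference for cross-checking it (an illustrative sketch, not part of
# the generated code) is the Fermat inverse x^254 over the AES
# polynomial x^8+x^4+x^3+x+1 (0x11b):
#
#	sub gf256_mul {
#		my ($a,$b) = @_;
#		my $r = 0;
#		for (0..7) {
#			$r ^= $a if $b & 1;
#			$b >>= 1;
#			$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
#		}
#		return $r;
#	}
#	sub gf256_inv {			# x^254 == 1/x for x != 0, 0 -> 0
#		my $x = shift;
#		my $r = 1;
#		for my $bit (1,1,1,1,1,1,1,0) {	# 254 = 0b11111110
#			$r = gf256_mul($r, $r);
#			$r = gf256_mul($r, $x) if $bit;
#		}
#		return $r;
#	}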

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}
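
# Note (an explanatory aside): because every xmm register holds one bit
# plane of all 128 state bytes, ShiftRows - a pure byte permutation -
# acts identically on each plane, so a single pshufb per register
# implements it; the preceding pxor's fold AddRoundKey into the same
# pass, and $key is then advanced past the eight bit-sliced round-key
# registers (0x80 bytes).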

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}
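
# For reference (an explanatory aside): on column bytes (a0,a1,a2,a3),
# MixColumns computes b_i = xtime(a_i ^ a_{i+1}) ^ a_{i+1} ^ a_{i+2} ^
# a_{i+3}, indices mod 4. Thanks to the .LM0* byte ordering applied
# before bitslicing, pshufd $0x93 and $0x4E rotate each 128-bit bit
# plane by 32 and 64 bits, generating the rotated-neighbour terms for
# all columns at once, while xtime shows up as the cross-plane XORs:
# plane i pulls from plane i-1, with plane 7 feeding planes 0, 1, 3
# and 4 per the 0x1b reduction polynomial.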

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
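#
# A quick sanity check of one entry over GF(2^8) (an illustrative
# note): row 0 of the right-hand product, column 0, gives
# 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0x0a ^ 0x04 = 0x0e, matching the
# (0,0) entry of the InvMixColumns matrix on the left.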

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor  	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
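
# swapmove exchanges, between $a and $b, the bit groups selected by
# $mask at distance $n: a scalar model (an illustrative sketch, not
# part of the generated code) is
#
#	sub swapmove_scalar {
#		my ($a,$b,$n,$mask) = @_;
#		my $t = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $t, $b ^ ($t << $n));
#	}
#
# i.e. the bits of $a selected by $mask trade places with the bits of
# $b $n positions above them - three XORs, one AND and two shifts.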
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
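
# bitslice runs the classic three-level swapmove network (distances 1,
# 2 and 4, masks .LBS0/.LBS1/.LBS2) over the eight registers, i.e. it
# transposes each 8x8 bit matrix formed by corresponding bytes of the
# eight blocks. A byte-sized scalar analogue, reusing swapmove_scalar
# from above (an illustrative sketch, up to bit-order conventions):
#
#	sub transpose8 {	# eight bytes in, one matrix row each
#		my @x = @_;
#		for my $step ([1,0x55], [2,0x33], [4,0x0f]) {
#			my ($n,$m) = @$step;
#			for (my $i=0; $i<8; $i+=2*$n) {
#				for my $j ($i..$i+$n-1) {
#					($x[$j],$x[$j+$n]) = swapmove_scalar(
#						$x[$j],$x[$j+$n],$n,$m);
#				}
#			}
#		}
#		return @x;
#	}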

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8
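
# A note on the loop control above (an explanatory aside): each
# `dec $rounds' sets the flags for both branches after it. While the
# count is positive, `jnz' re-enters .Lenc_loop with the regular .LSR
# ShiftRows mask; the pass on which it reaches zero still performs
# MixColumns but falls through to load .LSRM0 (the final-round
# ShiftRows merged with undoing the .LM0 byte ordering), so the loop
# body runs once more without MixColumns and exits via `jl .Lenc_done'
# when the counter goes negative.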

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
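
# A note on _bsaes_key_convert (an explanatory aside): for every key
# byte, `pcmpeqb' against the 0x01,0x02,...,0x80 masks broadcasts bit i
# into a full byte of the i-th output register, producing the 8x128-bit
# bit-sliced round key. Planes 0, 1, 5 and 6 are complemented (the
# "pnot" pxor's), which folds the S-box affine constant 0x63 into the
# schedule; that is also why callers "fix up" the first or last round
# key afterwards and why .L63 is left in %xmm7 on return.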

if (0 && !$win64) {	# the following four functions are an unsupported
			# interface used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
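
# Note (an explanatory aside): per the SysV AMD64 ABI the first six
# integer arguments arrive in rdi/rsi/rdx/rcx/r8/r9, while Win64 uses
# rcx/rdx/r8/r9 and passes the fifth and sixth arguments on the stack -
# hence the `mov 0xa0(%rsp),$arg5' and `mov 48(%rsp),$arg6' pulls in
# the Win64 paths below.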

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
   1561 $code.=<<___;
   1562 .extern	asm_AES_cbc_encrypt
   1563 .globl	bsaes_cbc_encrypt
   1564 .type	bsaes_cbc_encrypt,\@abi-omnipotent
   1565 .align	16
   1566 bsaes_cbc_encrypt:
   1567 ___
   1568 $code.=<<___ if ($win64);
   1569 	mov	48(%rsp),$arg6		# pull direction flag
   1570 ___
   1571 $code.=<<___;
   1572 	cmp	\$0,$arg6
   1573 	jne	asm_AES_cbc_encrypt
   1574 	cmp	\$128,$arg3
   1575 	jb	asm_AES_cbc_encrypt
   1576 
   1577 	mov	%rsp, %rax
   1578 .Lcbc_dec_prologue:
   1579 	push	%rbp
   1580 	push	%rbx
   1581 	push	%r12
   1582 	push	%r13
   1583 	push	%r14
   1584 	push	%r15
   1585 	lea	-0x48(%rsp), %rsp
   1586 ___
   1587 $code.=<<___ if ($win64);
   1588 	mov	0xa0(%rsp),$arg5	# pull ivp
   1589 	lea	-0xa0(%rsp), %rsp
   1590 	movaps	%xmm6, 0x40(%rsp)
   1591 	movaps	%xmm7, 0x50(%rsp)
   1592 	movaps	%xmm8, 0x60(%rsp)
   1593 	movaps	%xmm9, 0x70(%rsp)
   1594 	movaps	%xmm10, 0x80(%rsp)
   1595 	movaps	%xmm11, 0x90(%rsp)
   1596 	movaps	%xmm12, 0xa0(%rsp)
   1597 	movaps	%xmm13, 0xb0(%rsp)
   1598 	movaps	%xmm14, 0xc0(%rsp)
   1599 	movaps	%xmm15, 0xd0(%rsp)
   1600 .Lcbc_dec_body:
   1601 ___
   1602 $code.=<<___;
   1603 	mov	%rsp, %rbp		# backup %rsp
   1604 	mov	240($arg4), %eax	# rounds
   1605 	mov	$arg1, $inp		# backup arguments
   1606 	mov	$arg2, $out
   1607 	mov	$arg3, $len
   1608 	mov	$arg4, $key
   1609 	mov	$arg5, %rbx
   1610 	shr	\$4, $len		# bytes to blocks
   1611 
   1612 	mov	%eax, %edx		# rounds
   1613 	shl	\$7, %rax		# 128 bytes per inner round key
   1614 	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
   1615 	sub	%rax, %rsp
   1616 
   1617 	mov	%rsp, %rax		# pass key schedule
   1618 	mov	$key, %rcx		# pass key
   1619 	mov	%edx, %r10d		# pass rounds
   1620 	call	_bsaes_key_convert
   1621 	pxor	(%rsp),%xmm7		# fix up 0 round key
   1622 	movdqa	%xmm6,(%rax)		# save last round key
   1623 	movdqa	%xmm7,(%rsp)
   1624 
   1625 	movdqu	(%rbx), @XMM[15]	# load IV
   1626 	sub	\$8,$len
   1627 .Lcbc_dec_loop:
   1628 	movdqu	0x00($inp), @XMM[0]	# load input
   1629 	movdqu	0x10($inp), @XMM[1]
   1630 	movdqu	0x20($inp), @XMM[2]
   1631 	movdqu	0x30($inp), @XMM[3]
   1632 	movdqu	0x40($inp), @XMM[4]
   1633 	movdqu	0x50($inp), @XMM[5]
   1634 	mov	%rsp, %rax		# pass key schedule
   1635 	movdqu	0x60($inp), @XMM[6]
   1636 	mov	%edx,%r10d		# pass rounds
   1637 	movdqu	0x70($inp), @XMM[7]
   1638 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1639 
   1640 	call	_bsaes_decrypt8
   1641 
   1642 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1643 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1644 	movdqu	0x10($inp), @XMM[9]
   1645 	pxor	@XMM[8], @XMM[1]
   1646 	movdqu	0x20($inp), @XMM[10]
   1647 	pxor	@XMM[9], @XMM[6]
   1648 	movdqu	0x30($inp), @XMM[11]
   1649 	pxor	@XMM[10], @XMM[4]
   1650 	movdqu	0x40($inp), @XMM[12]
   1651 	pxor	@XMM[11], @XMM[2]
   1652 	movdqu	0x50($inp), @XMM[13]
   1653 	pxor	@XMM[12], @XMM[7]
   1654 	movdqu	0x60($inp), @XMM[14]
   1655 	pxor	@XMM[13], @XMM[3]
   1656 	movdqu	0x70($inp), @XMM[15]	# IV
   1657 	pxor	@XMM[14], @XMM[5]
   1658 	movdqu	@XMM[0], 0x00($out)	# write output
   1659 	lea	0x80($inp), $inp
   1660 	movdqu	@XMM[1], 0x10($out)
   1661 	movdqu	@XMM[6], 0x20($out)
   1662 	movdqu	@XMM[4], 0x30($out)
   1663 	movdqu	@XMM[2], 0x40($out)
   1664 	movdqu	@XMM[7], 0x50($out)
   1665 	movdqu	@XMM[3], 0x60($out)
   1666 	movdqu	@XMM[5], 0x70($out)
   1667 	lea	0x80($out), $out
   1668 	sub	\$8,$len
   1669 	jnc	.Lcbc_dec_loop
   1670 
   1671 	add	\$8,$len
   1672 	jz	.Lcbc_dec_done
   1673 
   1674 	movdqu	0x00($inp), @XMM[0]	# load input
   1675 	mov	%rsp, %rax		# pass key schedule
   1676 	mov	%edx, %r10d		# pass rounds
   1677 	cmp	\$2,$len
   1678 	jb	.Lcbc_dec_one
   1679 	movdqu	0x10($inp), @XMM[1]
   1680 	je	.Lcbc_dec_two
   1681 	movdqu	0x20($inp), @XMM[2]
   1682 	cmp	\$4,$len
   1683 	jb	.Lcbc_dec_three
   1684 	movdqu	0x30($inp), @XMM[3]
   1685 	je	.Lcbc_dec_four
   1686 	movdqu	0x40($inp), @XMM[4]
   1687 	cmp	\$6,$len
   1688 	jb	.Lcbc_dec_five
   1689 	movdqu	0x50($inp), @XMM[5]
   1690 	je	.Lcbc_dec_six
   1691 	movdqu	0x60($inp), @XMM[6]
   1692 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1693 	call	_bsaes_decrypt8
   1694 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1695 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1696 	movdqu	0x10($inp), @XMM[9]
   1697 	pxor	@XMM[8], @XMM[1]
   1698 	movdqu	0x20($inp), @XMM[10]
   1699 	pxor	@XMM[9], @XMM[6]
   1700 	movdqu	0x30($inp), @XMM[11]
   1701 	pxor	@XMM[10], @XMM[4]
   1702 	movdqu	0x40($inp), @XMM[12]
   1703 	pxor	@XMM[11], @XMM[2]
   1704 	movdqu	0x50($inp), @XMM[13]
   1705 	pxor	@XMM[12], @XMM[7]
   1706 	movdqu	0x60($inp), @XMM[15]	# IV
   1707 	pxor	@XMM[13], @XMM[3]
   1708 	movdqu	@XMM[0], 0x00($out)	# write output
   1709 	movdqu	@XMM[1], 0x10($out)
   1710 	movdqu	@XMM[6], 0x20($out)
   1711 	movdqu	@XMM[4], 0x30($out)
   1712 	movdqu	@XMM[2], 0x40($out)
   1713 	movdqu	@XMM[7], 0x50($out)
   1714 	movdqu	@XMM[3], 0x60($out)
   1715 	jmp	.Lcbc_dec_done
   1716 .align	16
   1717 .Lcbc_dec_six:
   1718 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1719 	call	_bsaes_decrypt8
   1720 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1721 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1722 	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lcbc_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
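	# e.g. for AES-128 (10 rounds): 10*128 - 96 = 1184 bytes, i.e.
	# 16 bytes for the round 0 key, 9*128 bytes of bit-sliced inner
	# round keys and 16 bytes for the last round key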
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow the prologue from _bsaes_encrypt8, taking the
	# opportunity to flip the byte order of the 32-bit counters
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lctr_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
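# bsaes_ctr32_encrypt_blocks treats the last 4 bytes of the IV as a
# 32-bit big-endian block counter; that is what the .LSWPUP byte-swap
# and the .LADD1-.LADD8 increments above implement. A minimal Perl
# reference model of just the counter arithmetic, for illustration
# only (ctr32_blocks_ref is not part of this module and is never
# called by it):
sub ctr32_blocks_ref {
    my ($iv, $n) = @_;			# 16-byte IV, number of blocks
    my @blocks;
    my $ctr = unpack("N", substr($iv, 12, 4));
    for (1 .. $n) {
	push @blocks, substr($iv, 0, 12) . pack("N", $ctr);
	$ctr = ($ctr + 1) & 0xffffffff;	# wraps modulo 2^32
    }
    return @blocks;
}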
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

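# The tweak schedule below multiplies the running 128-bit tweak by x
# in GF(2^128) once per block: paddq shifts each 64-bit half left by
# one, while the pcmpgtd/pshufd 0x13/pand sequence recovers the two
# carries paddq drops (bit 63 into bit 64, and bit 127 into the 0x87
# reduction). A Perl reference model of that update, for illustration
# only (xts_mul_x_ref is not part of this module and is never called):
sub xts_mul_x_ref {
    my @t = unpack("C16", shift);	# tweak as little-endian bytes
    my $carry = 0;
    for my $i (0 .. 15) {
	my $b = ($t[$i] << 1) | $carry;
	$carry = $b >> 8;
	$t[$i] = $b & 0xff;
    }
    $t[0] ^= 0x87 if $carry;		# reduce mod x^128+x^7+x^2+x+1
    return pack("C16", @t);
}
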
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

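	# ciphertext stealing: swap the %ebx trailing plaintext bytes
	# into the last complete ciphertext block, move the displaced
	# ciphertext bytes out as the final partial block, then
	# re-encrypt the spliced block under the last tweak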
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

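	# ciphertext stealing, decrypt flavour: the block just written
	# was decrypted under the *next* tweak and carries the final
	# partial plaintext; swap the trailing ciphertext bytes into it
	# and re-decrypt the spliced block under the saved tweak @XMM[6]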
.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
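	# GF(2^128) tweak-doubling masks: 0x87 is folded in on carry
	# out of bit 127 (x^128 = x^7+x^2+x+1), while the 1 in the
	# third dword propagates the carry from the low into the high
	# qword that paddq drops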
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
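	# 0x63 is the AES S-box affine constant; the key conversion
	# folds it into the round keys so the bit-sliced S-box can
	# skip the final constant addition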
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

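# se_handler below is shared by all entry points; each *_info record
# in .xdata hands it three RVAs via HandlerData[]: the end of the
# prologue, the epilogue, and the register-restoring tail, so it can
# tell how much of the frame is live at the faulting %rip.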
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# tail label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-48(%rax),%r15		# slots match the push order
	mov	-40(%rax),%r14		# in each function prologue
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbx
	mov	-8(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;