#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key schedule
#   and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.98		+9%
# Atom	    	17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
#	and xor-ing of the input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations account for no more
#	than 1% of the total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	The slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration, sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While it resulted in a nominal 4%
#	improvement on Atom, it hurt Westmere by a factor of more than 2x.
#
# As for the key schedule conversion subroutine: the interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to the CPU cycles spent in the 8x
# block function is:
#
# 		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Also keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
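#
# [A back-of-the-envelope model, added for clarity and not part of the
#  original notes: a call processing n 8x blocks costs about (n + ratio)
#  block-function times, so the relative slowdown is ratio/(n + ratio).
#  With the Atom..Core 2 ratios of 0.19..0.22 this gives 16-18% for n=1,
#  9-10% for n=2 and 6-7% for n=3, matching the figures above.]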
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	11.0
# Nehalem	9.16
# Atom		20.9
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor 	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor 	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor 	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
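
# In boolean terms, with ($x0,$x1) and ($y0,$y1) denoting the two bit
# planes of each GF(2^2) element, the sequence above computes (a reading
# of the code, added here for reference):
#
#	x0' = ((x0 ^ x1) & y1) ^ (x1 & y0)
#	x1' = (x0 & (y0 ^ y1)) ^ (x1 & y0)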

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor 	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
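
# [Added note: the routine above inverts in GF(2^8) via a tower-field
#  decomposition, cf. the Käsper-Schwabe paper referenced in the header;
#  the "(144)" in the comment box is presumably the operation count of
#  the original routine, by analogy with Mul_GF4's "(8)".]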

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
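# (pshufd with immediate 0x93 below selects source dwords 3,0,1,2, i.e.
# rotates a 128-bit lane by 32 bits - the "<<< 32" of the comments - and
# immediate 0x4E selects dwords 2,3,0,1, i.e. rotates by 64 bits.)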
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]

	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor  	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
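
# A scalar model of the bit swap above (an illustrative sketch, not used
# by the generated code): swapmove exchanges the bits of $a selected by
# $mask with the bits of $b that sit $n positions to the left of the mask:
#
#	sub swapmove_ref {
#		my ($a,$b,$n,$mask)=@_;
#		my $t = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $t, $b ^ ($t << $n));
#	}
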
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
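
# [Added note: bitslice() is an 8x8 bit-matrix transposition done in three
#  swapmove2x passes with strides 1, 2 and 4; afterwards register i holds
#  bit plane i of all input bytes. The transform is an involution, which
#  is why the same routine converts back at .Lenc_done/.Ldec_done.]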

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}

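# [Added note: _bsaes_key_convert expands every 16-byte round key to 128
#  bytes: eight mask registers, each key bit stretched to a 0x00/0xff byte
#  by pcmpeqb against the 0x01/0x02/.../0x80 masks. The planes for bits
#  0, 1, 5 and 6 are complemented ("pnot"), folding the S-box constant
#  0x63 = 0b01100011 into the schedule; the "fix up" pxors in the callers
#  compensate in the round 0 and last round keys.]
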
if (0 && !$win64) {	# the following four functions are an unsupported
			# interface, used only for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
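
# (Win64 passes the first four integer arguments in %rcx, %rdx, %r8 and
# %r9 and further ones on the stack, while the SysV ABI uses %rdi, %rsi,
# %rdx, %rcx, %r8 and %r9; hence the two register sets above and the
# "pull ivp"/"pull direction flag" stack loads in the Win64-only code
# below.)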

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
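	# (e.g. 10 rounds: 10*128-96 = 1184 bytes, i.e. 9 bit-sliced round
	# keys plus 16 bytes for each of the round 0 and last round keys)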
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
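
# CBC encryption is inherently serial and cannot use the 8x bit-sliced
# path, so encryption (and anything shorter than 128 bytes) is handed
# straight to asm_AES_cbc_encrypt; only CBC decryption, where the blocks
# are independent, is bit-sliced.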
   1504 $code.=<<___;
   1505 .extern	asm_AES_cbc_encrypt
   1506 .globl	bsaes_cbc_encrypt
   1507 .type	bsaes_cbc_encrypt,\@abi-omnipotent
   1508 .align	16
   1509 bsaes_cbc_encrypt:
   1510 ___
   1511 $code.=<<___ if ($win64);
   1512 	mov	48(%rsp),$arg6		# pull direction flag
   1513 ___
   1514 $code.=<<___;
   1515 	cmp	\$0,$arg6
   1516 	jne	asm_AES_cbc_encrypt
   1517 	cmp	\$128,$arg3
   1518 	jb	asm_AES_cbc_encrypt
   1519 
   1520 	mov	%rsp, %rax
   1521 .Lcbc_dec_prologue:
   1522 	push	%rbp
   1523 	push	%rbx
   1524 	push	%r12
   1525 	push	%r13
   1526 	push	%r14
   1527 	push	%r15
   1528 	lea	-0x48(%rsp), %rsp
   1529 ___
   1530 $code.=<<___ if ($win64);
   1531 	mov	0xa0(%rsp),$arg5	# pull ivp
   1532 	lea	-0xa0(%rsp), %rsp
   1533 	movaps	%xmm6, 0x40(%rsp)
   1534 	movaps	%xmm7, 0x50(%rsp)
   1535 	movaps	%xmm8, 0x60(%rsp)
   1536 	movaps	%xmm9, 0x70(%rsp)
   1537 	movaps	%xmm10, 0x80(%rsp)
   1538 	movaps	%xmm11, 0x90(%rsp)
   1539 	movaps	%xmm12, 0xa0(%rsp)
   1540 	movaps	%xmm13, 0xb0(%rsp)
   1541 	movaps	%xmm14, 0xc0(%rsp)
   1542 	movaps	%xmm15, 0xd0(%rsp)
   1543 .Lcbc_dec_body:
   1544 ___
   1545 $code.=<<___;
   1546 	mov	%rsp, %rbp		# backup %rsp
   1547 	mov	240($arg4), %eax	# rounds
   1548 	mov	$arg1, $inp		# backup arguments
   1549 	mov	$arg2, $out
   1550 	mov	$arg3, $len
   1551 	mov	$arg4, $key
   1552 	mov	$arg5, %rbx
   1553 	shr	\$4, $len		# bytes to blocks
   1554 
   1555 	mov	%eax, %edx		# rounds
   1556 	shl	\$7, %rax		# 128 bytes per inner round key
   1557 	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
   1558 	sub	%rax, %rsp
   1559 
   1560 	mov	%rsp, %rax		# pass key schedule
   1561 	mov	$key, %rcx		# pass key
   1562 	mov	%edx, %r10d		# pass rounds
   1563 	call	_bsaes_key_convert
   1564 	pxor	(%rsp),%xmm7		# fix up 0 round key
   1565 	movdqa	%xmm6,(%rax)		# save last round key
   1566 	movdqa	%xmm7,(%rsp)
   1567 
   1568 	movdqu	(%rbx), @XMM[15]	# load IV
   1569 	sub	\$8,$len
   1570 .Lcbc_dec_loop:
   1571 	movdqu	0x00($inp), @XMM[0]	# load input
   1572 	movdqu	0x10($inp), @XMM[1]
   1573 	movdqu	0x20($inp), @XMM[2]
   1574 	movdqu	0x30($inp), @XMM[3]
   1575 	movdqu	0x40($inp), @XMM[4]
   1576 	movdqu	0x50($inp), @XMM[5]
   1577 	mov	%rsp, %rax		# pass key schedule
   1578 	movdqu	0x60($inp), @XMM[6]
   1579 	mov	%edx,%r10d		# pass rounds
   1580 	movdqu	0x70($inp), @XMM[7]
   1581 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1582 
   1583 	call	_bsaes_decrypt8
   1584 
   1585 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1586 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1587 	movdqu	0x10($inp), @XMM[9]
   1588 	pxor	@XMM[8], @XMM[1]
   1589 	movdqu	0x20($inp), @XMM[10]
   1590 	pxor	@XMM[9], @XMM[6]
   1591 	movdqu	0x30($inp), @XMM[11]
   1592 	pxor	@XMM[10], @XMM[4]
   1593 	movdqu	0x40($inp), @XMM[12]
   1594 	pxor	@XMM[11], @XMM[2]
   1595 	movdqu	0x50($inp), @XMM[13]
   1596 	pxor	@XMM[12], @XMM[7]
   1597 	movdqu	0x60($inp), @XMM[14]
   1598 	pxor	@XMM[13], @XMM[3]
   1599 	movdqu	0x70($inp), @XMM[15]	# IV
   1600 	pxor	@XMM[14], @XMM[5]
   1601 	movdqu	@XMM[0], 0x00($out)	# write output
   1602 	lea	0x80($inp), $inp
   1603 	movdqu	@XMM[1], 0x10($out)
   1604 	movdqu	@XMM[6], 0x20($out)
   1605 	movdqu	@XMM[4], 0x30($out)
   1606 	movdqu	@XMM[2], 0x40($out)
   1607 	movdqu	@XMM[7], 0x50($out)
   1608 	movdqu	@XMM[3], 0x60($out)
   1609 	movdqu	@XMM[5], 0x70($out)
   1610 	lea	0x80($out), $out
   1611 	sub	\$8,$len
   1612 	jnc	.Lcbc_dec_loop
   1613 
   1614 	add	\$8,$len
   1615 	jz	.Lcbc_dec_done
   1616 
   1617 	movdqu	0x00($inp), @XMM[0]	# load input
   1618 	mov	%rsp, %rax		# pass key schedule
   1619 	mov	%edx, %r10d		# pass rounds
   1620 	cmp	\$2,$len
   1621 	jb	.Lcbc_dec_one
   1622 	movdqu	0x10($inp), @XMM[1]
   1623 	je	.Lcbc_dec_two
   1624 	movdqu	0x20($inp), @XMM[2]
   1625 	cmp	\$4,$len
   1626 	jb	.Lcbc_dec_three
   1627 	movdqu	0x30($inp), @XMM[3]
   1628 	je	.Lcbc_dec_four
   1629 	movdqu	0x40($inp), @XMM[4]
   1630 	cmp	\$6,$len
   1631 	jb	.Lcbc_dec_five
   1632 	movdqu	0x50($inp), @XMM[5]
   1633 	je	.Lcbc_dec_six
   1634 	movdqu	0x60($inp), @XMM[6]
   1635 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1636 	call	_bsaes_decrypt8
   1637 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1638 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1639 	movdqu	0x10($inp), @XMM[9]
   1640 	pxor	@XMM[8], @XMM[1]
   1641 	movdqu	0x20($inp), @XMM[10]
   1642 	pxor	@XMM[9], @XMM[6]
   1643 	movdqu	0x30($inp), @XMM[11]
   1644 	pxor	@XMM[10], @XMM[4]
   1645 	movdqu	0x40($inp), @XMM[12]
   1646 	pxor	@XMM[11], @XMM[2]
   1647 	movdqu	0x50($inp), @XMM[13]
   1648 	pxor	@XMM[12], @XMM[7]
   1649 	movdqu	0x60($inp), @XMM[15]	# IV
   1650 	pxor	@XMM[13], @XMM[3]
   1651 	movdqu	@XMM[0], 0x00($out)	# write output
   1652 	movdqu	@XMM[1], 0x10($out)
   1653 	movdqu	@XMM[6], 0x20($out)
   1654 	movdqu	@XMM[4], 0x30($out)
   1655 	movdqu	@XMM[2], 0x40($out)
   1656 	movdqu	@XMM[7], 0x50($out)
   1657 	movdqu	@XMM[3], 0x60($out)
   1658 	jmp	.Lcbc_dec_done
   1659 .align	16
   1660 .Lcbc_dec_six:
   1661 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1662 	call	_bsaes_decrypt8
   1663 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1664 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1665 	movdqu	0x10($inp), @XMM[9]
   1666 	pxor	@XMM[8], @XMM[1]
   1667 	movdqu	0x20($inp), @XMM[10]
   1668 	pxor	@XMM[9], @XMM[6]
   1669 	movdqu	0x30($inp), @XMM[11]
   1670 	pxor	@XMM[10], @XMM[4]
   1671 	movdqu	0x40($inp), @XMM[12]
   1672 	pxor	@XMM[11], @XMM[2]
   1673 	movdqu	0x50($inp), @XMM[15]	# IV
   1674 	pxor	@XMM[12], @XMM[7]
   1675 	movdqu	@XMM[0], 0x00($out)	# write output
   1676 	movdqu	@XMM[1], 0x10($out)
   1677 	movdqu	@XMM[6], 0x20($out)
   1678 	movdqu	@XMM[4], 0x30($out)
   1679 	movdqu	@XMM[2], 0x40($out)
   1680 	movdqu	@XMM[7], 0x50($out)
   1681 	jmp	.Lcbc_dec_done
   1682 .align	16
   1683 .Lcbc_dec_five:
   1684 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1685 	call	_bsaes_decrypt8
   1686 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1687 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1688 	movdqu	0x10($inp), @XMM[9]
   1689 	pxor	@XMM[8], @XMM[1]
   1690 	movdqu	0x20($inp), @XMM[10]
   1691 	pxor	@XMM[9], @XMM[6]
   1692 	movdqu	0x30($inp), @XMM[11]
   1693 	pxor	@XMM[10], @XMM[4]
   1694 	movdqu	0x40($inp), @XMM[15]	# IV
   1695 	pxor	@XMM[11], @XMM[2]
   1696 	movdqu	@XMM[0], 0x00($out)	# write output
   1697 	movdqu	@XMM[1], 0x10($out)
   1698 	movdqu	@XMM[6], 0x20($out)
   1699 	movdqu	@XMM[4], 0x30($out)
   1700 	movdqu	@XMM[2], 0x40($out)
   1701 	jmp	.Lcbc_dec_done
   1702 .align	16
   1703 .Lcbc_dec_four:
   1704 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1705 	call	_bsaes_decrypt8
   1706 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1707 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1708 	movdqu	0x10($inp), @XMM[9]
   1709 	pxor	@XMM[8], @XMM[1]
   1710 	movdqu	0x20($inp), @XMM[10]
   1711 	pxor	@XMM[9], @XMM[6]
   1712 	movdqu	0x30($inp), @XMM[15]	# IV
   1713 	pxor	@XMM[10], @XMM[4]
   1714 	movdqu	@XMM[0], 0x00($out)	# write output
   1715 	movdqu	@XMM[1], 0x10($out)
   1716 	movdqu	@XMM[6], 0x20($out)
   1717 	movdqu	@XMM[4], 0x30($out)
   1718 	jmp	.Lcbc_dec_done
   1719 .align	16
   1720 .Lcbc_dec_three:
   1721 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1722 	call	_bsaes_decrypt8
   1723 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1724 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1725 	movdqu	0x10($inp), @XMM[9]
   1726 	pxor	@XMM[8], @XMM[1]
   1727 	movdqu	0x20($inp), @XMM[15]	# IV
   1728 	pxor	@XMM[9], @XMM[6]
   1729 	movdqu	@XMM[0], 0x00($out)	# write output
   1730 	movdqu	@XMM[1], 0x10($out)
   1731 	movdqu	@XMM[6], 0x20($out)
   1732 	jmp	.Lcbc_dec_done
   1733 .align	16
   1734 .Lcbc_dec_two:
   1735 	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
   1736 	call	_bsaes_decrypt8
   1737 	pxor	0x20(%rbp), @XMM[0]	# ^= IV
   1738 	movdqu	0x00($inp), @XMM[8]	# re-load input
   1739 	movdqu	0x10($inp), @XMM[15]	# IV
   1740 	pxor	@XMM[8], @XMM[1]
   1741 	movdqu	@XMM[0], 0x00($out)	# write output
   1742 	movdqu	@XMM[1], 0x10($out)
   1743 	jmp	.Lcbc_dec_done
   1744 .align	16
   1745 .Lcbc_dec_one:
   1746 	lea	($inp), $arg1
   1747 	lea	0x20(%rbp), $arg2	# buffer output
   1748 	lea	($key), $arg3
   1749 	call	asm_AES_decrypt		# doesn't touch %xmm
   1750 	pxor	0x20(%rbp), @XMM[15]	# ^= IV
   1751 	movdqu	@XMM[15], ($out)	# write output
   1752 	movdqa	@XMM[0], @XMM[15]	# IV
   1753 
   1754 .Lcbc_dec_done:
   1755 	movdqu	@XMM[15], (%rbx)	# return IV
   1756 	lea	(%rsp), %rax
   1757 	pxor	%xmm0, %xmm0
   1758 .Lcbc_dec_bzero:			# wipe key schedule [if any]
   1759 	movdqa	%xmm0, 0x00(%rax)
   1760 	movdqa	%xmm0, 0x10(%rax)
   1761 	lea	0x20(%rax), %rax
   1762 	cmp	%rax, %rbp
   1763 	ja	.Lcbc_dec_bzero
   1764 
   1765 	lea	(%rbp),%rsp		# restore %rsp
   1766 ___
   1767 $code.=<<___ if ($win64);
   1768 	movaps	0x40(%rbp), %xmm6
   1769 	movaps	0x50(%rbp), %xmm7
   1770 	movaps	0x60(%rbp), %xmm8
   1771 	movaps	0x70(%rbp), %xmm9
   1772 	movaps	0x80(%rbp), %xmm10
   1773 	movaps	0x90(%rbp), %xmm11
   1774 	movaps	0xa0(%rbp), %xmm12
   1775 	movaps	0xb0(%rbp), %xmm13
   1776 	movaps	0xc0(%rbp), %xmm14
   1777 	movaps	0xd0(%rbp), %xmm15
   1778 	lea	0xa0(%rbp), %rsp
   1779 ___
   1780 $code.=<<___;
   1781 	mov	0x48(%rsp), %r15
   1782 	mov	0x50(%rsp), %r14
   1783 	mov	0x58(%rsp), %r13
   1784 	mov	0x60(%rsp), %r12
   1785 	mov	0x68(%rsp), %rbx
   1786 	mov	0x70(%rsp), %rax
   1787 	lea	0x78(%rsp), %rsp
   1788 	mov	%rax, %rbp
   1789 .Lcbc_dec_epilogue:
   1790 	ret
   1791 .size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
   1792 
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
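	# The bit-sliced schedule needs (rounds-1)*128+32 = rounds*128-96
	# bytes: 128 per inner round key, plus 16 apiece for the first and
	# last round keys, which are kept in conventional form.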

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
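	# (.LSWPUP above already flipped the big-endian counter dword so
	# that plain paddd works; .LSWPUPM0SR below swaps it back and
	# applies the bit-slice input permutation in the same pshufb.)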
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	 pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
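	# neither SSE instructions nor lea touch EFLAGS, so ZF below is
	# still the one set by the block-count subtraction above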
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
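# Between consecutive blocks the tweak is doubled in GF(2^128) modulo
# x^128+x^7+x^2+x+1. Below is a byte-level Perl model of that update,
# a hypothetical reference helper that is not used by the generated
# code and only documents what the SSE sequences further down compute:
sub xts_double_tweak_ref {
	my @t = @_;			# 16 tweak bytes, least significant first
	my $carry = $t[15]>>7;		# bit 127 falls off the top
	for (my $i=15; $i>0; $i--) {	# shift the whole tweak left by one bit
		$t[$i] = (($t[$i]<<1)|($t[$i-1]>>7))&0xff;
	}
	$t[0] = (($t[0]<<1)&0xff)^($carry ? 0x87 : 0);	# fold carry back in
	return @t;
}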
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
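    # Each pass emits one tweak update: pcmpgtd against zero broadcasts
    # every dword's sign bit, pshufd 0x13 routes bit 127 to the low
    # dword and bit 63 to the third, pand with .Lxts_magic reduces those
    # to the 0x87 residue and the qword carry, and paddq/pxor double the
    # tweak and fold both back in.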
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
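	# swap the trailing partial block with the tail of the last full
	# ciphertext block, one byte at a time (ciphertext stealing)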
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len
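	# with a partial tail the last complete block is held back, since
	# ciphertext stealing decrypts it with the tweak that follows the
	# tail's tweak (see .Lxts_dec_done)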

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
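	# graft the ciphertext tail onto the block just written, moving
	# the displaced plaintext bytes out to the tail (ciphertext stealing)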
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
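	# the magic constant: 0x87 encodes the XTS reduction polynomial
	# x^7+x^2+x+1, and the 1 in the third dword carries tweak bit 63
	# into the upper qword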
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

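# In outline: se_handler() uses HandlerData[] = {body label, epilogue
# label} to decide whether the faulting Rip lies inside a function
# body; if so it recovers the frame from context->Rbp, restores the
# ten saved %xmm registers and the six non-volatile GPRs from that
# frame, then hands off to RtlVirtualUnwind to continue the unwind.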
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;