#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for the Intel AES-NI extension. In
# the OpenSSL context it's used with the Intel engine, but it can also
# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
# [see below for details].
#
# Performance.
#
# Given the aes[enc|dec] instructions' latency, asymptotic performance
# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
# byte processed with a 128-bit key. And given their throughput,
# asymptotic performance for parallelizable modes is 1.25 cycles per
# byte. Being an asymptotic limit, it's not something you commonly
# achieve in reality, but how close does one get? Below are results
# collected for different modes and block sizes. Pairs of numbers are
# for en-/decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that the otherwise used 'openssl speed -evp aes-128-??? -engine
# aesni [-decrypt]' will exhibit 10-15% worse results for smaller
# blocks. The results were collected with a specially crafted speed.c
# benchmark in order to compare them with the results reported in the
# "Intel Advanced Encryption Standard (AES) New Instruction Set" White
# Paper, Revision 3.0, dated May 2010. All of the above results are
# consistently better. This module also provides better performance
# for block sizes smaller than 128 bytes at points *not* represented
# in the above table.
#
# Looking at the results for the 8-KB buffer.
#
# The CFB and OFB results are far from the limit, because the
# implementation uses the "generic" CRYPTO_[c|o]fb128_encrypt
# interfaces relying on single-block aesni_encrypt, which is not the
# optimal way to go. The CBC encrypt result is unexpectedly high and
# there is no documented explanation for it. Seemingly there is a
# small penalty for feeding the result back to the AES unit the way
# it's done in CBC mode. There is nothing one can do about it and the
# result appears optimal. The CCM result is identical to CBC, because
# CBC-MAC is essentially CBC encrypt without saving the output. The
# CCM CTR part "stays invisible," because it's neatly interleaved with
# the CBC-MAC. This provides a ~30% improvement over a
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how the results vary with buffer size.
#
# The curves are practically saturated at a 1-KB buffer size. In most
# cases the "256-byte" performance is >95%, and the "64-byte" one
# ~90%, of the "8-KB" figure. The CTR curve doesn't follow this
# pattern and is the slowest-changing one, with the "256-byte" result
# being 87% of "8-KB." This is because the per-block overhead in CTR
# mode is the most computationally intensive. Small-block CCM decrypt
# is slower than encrypt, because the first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with the number of
# rounds for larger block sizes, i.e. the 192-bit result being 10/12
# times lower and the 256-bit one 10/14 times lower. In the CBC
# encrypt case the differences are a tad smaller, because the
# above-mentioned penalty biases all results by the same constant
# value. In a similar way function-call overhead affects small-block
# performance, as well as the OFB and CFB results. The differences are
# not large; the most common coefficients are 10/11.7 and 10/13.4 (as
# opposed to 10/12.0 and 10/14.0), but one can observe even 10/11.2
# and 10/12.4 (CTR, OFB, CFB)...
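#
# As a rough worked example of that scaling (an estimate derived from
# the figures above, not an additional measurement): "perfect" scaling
# of the 8-KB ECB result of 1.26 cycles per byte gives about
# 1.26*12/10 = ~1.51 for a 192-bit key and 1.26*14/10 = ~1.76 for a
# 256-bit key.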

# January 2011
#
# While the Westmere processor features 6-cycle latency for
# aes[enc|dec] instructions, which can be scheduled every second
# cycle, Sandy Bridge spends 8 cycles per instruction but can schedule
# them every cycle. This means that code targeting Westmere would
# perform suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. The relative improvement might appear modest, 8% on
# Westmere, but in absolute terms it's 3.77 cycles per byte encrypted
# with a 128-bit key on Westmere, and 5.07 on Sandy Bridge. These
# numbers should be compared to the asymptotic limits of 3.75 for
# Westmere and 5.00 for Sandy Bridge. Actually, the fact that they get
# this close to the asymptotic limits is quite amazing. Indeed, the
# limit is calculated as latency times the number of rounds, 10 for a
# 128-bit key, divided by 16, the number of bytes in a block; in other
# words it accounts *solely* for the aesenc instructions. But there
# are extra instructions, and numbers this close to the asymptotic
# limits mean that it's as if it takes as little as *one* additional
# cycle to execute all of them. How is that possible? It is possible
# thanks to the out-of-order execution logic, which manages to overlap
# post-processing of the previous block, things like saving the
# output, with the actual encryption of the current block, as well as
# pre-processing of the current block, things like fetching the input
# and xor-ing it with the 0-round element of the key schedule, with
# the actual encryption of the previous block. Keep this in mind...
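#
# To spell that arithmetic out (an illustration only, it is not used
# anywhere by this script): limit = latency * rounds / 16 bytes per
# block, e.g.
#
#	sub aes_cbc_limit { my ($latency,$rounds)=@_; $latency*$rounds/16; }
#	# aes_cbc_limit(6,10) == 3.75	(Westmere,     128-bit key)
#	# aes_cbc_limit(8,10) == 5.00	(Sandy Bridge, 128-bit key)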
#
# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In that case the asymptotic limit for such modes
# can be obtained by dividing the above-mentioned numbers by the AES
# instructions' interleave factor. Westmere can execute at most 3 AES
# instructions at a time, meaning that the optimal interleave factor
# is 3, and that's where the "magic" number of 1.25 comes from.
# "Optimal interleave factor" means that a further increase of the
# interleave factor does not improve performance. The formula has
# proven to reflect reality pretty well on Westmere... Sandy Bridge,
# on the other hand, can execute up to 8 AES instructions at a time,
# so how does varying the interleave factor affect performance? Here
# is a table for ECB (numbers are cycles per byte processed with a
# 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.93
# CTR					1.14	0.91	n/a
#
# Well, given the 3x column it's probably inappropriate to call the
# limit asymptotic if it can be surpassed, isn't it? What happens
# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
# execution magic is responsible for this. The processor overlaps not
# only the additional instructions with the AES ones, but even AES
# instructions processing adjacent triplets of independent blocks. In
# the 6x case the additional instructions still claim a
# disproportionally small amount of additional cycles, but in the 8x
# case the number of instructions must be a tad too high for the
# out-of-order logic to cope with, and the AES unit remains
# underutilized... As you can see, 8x interleave is hardly
# justifiable, so there is no need to feel bad that the 32-bit
# aesni-x86.pl utilizes 6x interleave because of its limited register
# bank capacity.
#
# Higher interleave factors do have a negative impact on Westmere
# performance. While for ECB mode it's a negligible ~1.5%, other
# parallelizable modes perform ~5% worse, which is outweighed by the
# ~25% improvement on Sandy Bridge. To balance the regression on
# Westmere, CTR mode was implemented with a 6x aesenc interleave
# factor.
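#
# For reference, the ECB rows above follow from the same formula (the
# numbers below are derived, not independently measured): the
# theoretical limit is 5.00 divided by the interleave factor, i.e.
# 5.00/3 = 1.67, 5.00/6 = 0.83, 5.00/8 = 0.625, while the "as if"
# factor is 5.00 divided by the measured result, i.e. 5.00/1.05 =
# ~4.7x, 5.00/0.86 = ~5.8x, 5.00/0.84 = ~6.0x.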

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.97. Just as
# in CTR mode, the AES instruction interleave factor was chosen to be
# 6x.

    160 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
    161 			# generates drop-in replacement for
    162 			# crypto/aes/asm/aes-x86_64.pl:-)
    163 
    164 $flavour = shift;
    165 $output  = shift;
    166 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    167 
    168 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    169 
    170 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    171 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    172 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    173 die "can't locate x86_64-xlate.pl";
    174 
    175 open STDOUT,"| $^X $xlate $flavour $output";
    176 
    177 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
    178 @_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
    179 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
    180 
    181 $code=".text\n";
    182 
    183 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
    184 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
    185 $inp="%rdi";
    186 $out="%rsi";
    187 $len="%rdx";
    188 $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
    189 $ivp="%r8";	# cbc, ctr, ...
    190 
    191 $rnds_="%r10d";	# backup copy for $rounds
    192 $key_="%r11";	# backup copy for $key
    193 
    194 # %xmm register layout
    195 $rndkey0="%xmm0";	$rndkey1="%xmm1";
    196 $inout0="%xmm2";	$inout1="%xmm3";
    197 $inout2="%xmm4";	$inout3="%xmm5";
    198 $inout4="%xmm6";	$inout5="%xmm7";
    199 $inout6="%xmm8";	$inout7="%xmm9";
    200 
    201 $in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
    202 $in0="%xmm8";		$iv="%xmm9";
    203 
# Inline version of the internal aesni_[en|de]crypt1.
#
# Why a folded loop? Because aes[enc|dec] is slow enough to hide the
# cycles that take care of the loop variables...
    209 { my $sn;
    210 sub aesni_generate1 {
    211 my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
    212 ++$sn;
    213 $code.=<<___;
    214 	$movkey	($key),$rndkey0
    215 	$movkey	16($key),$rndkey1
    216 ___
    217 $code.=<<___ if (defined($ivec));
    218 	xorps	$rndkey0,$ivec
    219 	lea	32($key),$key
    220 	xorps	$ivec,$inout
    221 ___
    222 $code.=<<___ if (!defined($ivec));
    223 	lea	32($key),$key
    224 	xorps	$rndkey0,$inout
    225 ___
    226 $code.=<<___;
    227 .Loop_${p}1_$sn:
    228 	aes${p}	$rndkey1,$inout
    229 	dec	$rounds
    230 	$movkey	($key),$rndkey1
    231 	lea	16($key),$key
    232 	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
    233 	aes${p}last	$rndkey1,$inout
    234 ___
    235 }}
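# For reference, a call like &aesni_generate1("enc",$key,$rounds) with
# no $ivec expands to roughly the following (register aliases as
# defined above, and a unique numeric suffix on the label):
#
#	movups	($key),$rndkey0		# round key 0
#	movups	16($key),$rndkey1	# round key 1
#	lea	32($key),$key
#	xorps	$rndkey0,$inout0	# whitening
# .Loop_enc1_N:
#	aesenc	$rndkey1,$inout0
#	dec	$rounds
#	movups	($key),$rndkey1		# fetch next round key
#	lea	16($key),$key
#	jnz	.Loop_enc1_N
#	aesenclast	$rndkey1,$inout0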
    236 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
    237 #
    238 { my ($inp,$out,$key) = @_4args;
    239 
    240 $code.=<<___;
    241 .globl	${PREFIX}_encrypt
    242 .type	${PREFIX}_encrypt,\@abi-omnipotent
    243 .align	16
    244 ${PREFIX}_encrypt:
    245 	movups	($inp),$inout0		# load input
    246 	mov	240($key),$rounds	# key->rounds
    247 ___
    248 	&aesni_generate1("enc",$key,$rounds);
    249 $code.=<<___;
    250 	movups	$inout0,($out)		# output
    251 	ret
    252 .size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
    253 
    254 .globl	${PREFIX}_decrypt
    255 .type	${PREFIX}_decrypt,\@abi-omnipotent
    256 .align	16
    257 ${PREFIX}_decrypt:
    258 	movups	($inp),$inout0		# load input
    259 	mov	240($key),$rounds	# key->rounds
    260 ___
    261 	&aesni_generate1("dec",$key,$rounds);
    262 $code.=<<___;
    263 	movups	$inout0,($out)		# output
    264 	ret
    265 .size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
    266 ___
    267 }
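# Note: both routines take their round count straight from 240($key)
# (key->rounds) and feed it to the folded loop above unchanged, so
# they assume a key schedule laid out by the matching AES-NI key-setup
# code.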
    268 
# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
# factor. Why were 3x subroutines originally used in the loops? Even
# though aes[enc|dec] latency was originally 6, the instructions could
# be scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. the subroutine's throughput is
# virtually the same as that of the non-interleaved subroutine [for up
# to 3 input blocks]. This is why it makes no sense to implement a 2x
# subroutine. aes[enc|dec] latency in the next processor generation is
# 8, but the instructions can be scheduled every cycle. The optimal
# interleave for the new processor is therefore 8x...
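# (In other words: with 6-cycle latency and an issue every other cycle,
# about 6/2 = 3 independent blocks keep the AES unit busy, while
# 8-cycle latency at one issue per cycle calls for 8/1 = 8 of them.)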
    280 sub aesni_generate3 {
    281 my $dir=shift;
    282 # As already mentioned it takes in $key and $rounds, which are *not*
    283 # preserved. $inout[0-2] is cipher/clear text...
    284 $code.=<<___;
    285 .type	_aesni_${dir}rypt3,\@abi-omnipotent
    286 .align	16
    287 _aesni_${dir}rypt3:
    288 	$movkey	($key),$rndkey0
    289 	shr	\$1,$rounds
    290 	$movkey	16($key),$rndkey1
    291 	lea	32($key),$key
    292 	xorps	$rndkey0,$inout0
    293 	xorps	$rndkey0,$inout1
    294 	xorps	$rndkey0,$inout2
    295 	$movkey		($key),$rndkey0
    296 
    297 .L${dir}_loop3:
    298 	aes${dir}	$rndkey1,$inout0
    299 	aes${dir}	$rndkey1,$inout1
    300 	dec		$rounds
    301 	aes${dir}	$rndkey1,$inout2
    302 	$movkey		16($key),$rndkey1
    303 	aes${dir}	$rndkey0,$inout0
    304 	aes${dir}	$rndkey0,$inout1
    305 	lea		32($key),$key
    306 	aes${dir}	$rndkey0,$inout2
    307 	$movkey		($key),$rndkey0
    308 	jnz		.L${dir}_loop3
    309 
    310 	aes${dir}	$rndkey1,$inout0
    311 	aes${dir}	$rndkey1,$inout1
    312 	aes${dir}	$rndkey1,$inout2
    313 	aes${dir}last	$rndkey0,$inout0
    314 	aes${dir}last	$rndkey0,$inout1
    315 	aes${dir}last	$rndkey0,$inout2
    316 	ret
    317 .size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
    318 ___
    319 }
# 4x interleave is implemented to improve small-block performance,
# most notably [and naturally] the 4-block case by ~30%. One can argue
# that 5x should have been implemented as well, but the improvement
# would be <20%, so it's not worth it...
    324 sub aesni_generate4 {
    325 my $dir=shift;
    326 # As already mentioned it takes in $key and $rounds, which are *not*
    327 # preserved. $inout[0-3] is cipher/clear text...
    328 $code.=<<___;
    329 .type	_aesni_${dir}rypt4,\@abi-omnipotent
    330 .align	16
    331 _aesni_${dir}rypt4:
    332 	$movkey	($key),$rndkey0
    333 	shr	\$1,$rounds
    334 	$movkey	16($key),$rndkey1
    335 	lea	32($key),$key
    336 	xorps	$rndkey0,$inout0
    337 	xorps	$rndkey0,$inout1
    338 	xorps	$rndkey0,$inout2
    339 	xorps	$rndkey0,$inout3
    340 	$movkey	($key),$rndkey0
    341 
    342 .L${dir}_loop4:
    343 	aes${dir}	$rndkey1,$inout0
    344 	aes${dir}	$rndkey1,$inout1
    345 	dec		$rounds
    346 	aes${dir}	$rndkey1,$inout2
    347 	aes${dir}	$rndkey1,$inout3
    348 	$movkey		16($key),$rndkey1
    349 	aes${dir}	$rndkey0,$inout0
    350 	aes${dir}	$rndkey0,$inout1
    351 	lea		32($key),$key
    352 	aes${dir}	$rndkey0,$inout2
    353 	aes${dir}	$rndkey0,$inout3
    354 	$movkey		($key),$rndkey0
    355 	jnz		.L${dir}_loop4
    356 
    357 	aes${dir}	$rndkey1,$inout0
    358 	aes${dir}	$rndkey1,$inout1
    359 	aes${dir}	$rndkey1,$inout2
    360 	aes${dir}	$rndkey1,$inout3
    361 	aes${dir}last	$rndkey0,$inout0
    362 	aes${dir}last	$rndkey0,$inout1
    363 	aes${dir}last	$rndkey0,$inout2
    364 	aes${dir}last	$rndkey0,$inout3
    365 	ret
    366 .size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
    367 ___
    368 }
    369 sub aesni_generate6 {
    370 my $dir=shift;
    371 # As already mentioned it takes in $key and $rounds, which are *not*
    372 # preserved. $inout[0-5] is cipher/clear text...
    373 $code.=<<___;
    374 .type	_aesni_${dir}rypt6,\@abi-omnipotent
    375 .align	16
    376 _aesni_${dir}rypt6:
    377 	$movkey		($key),$rndkey0
    378 	shr		\$1,$rounds
    379 	$movkey		16($key),$rndkey1
    380 	lea		32($key),$key
    381 	xorps		$rndkey0,$inout0
    382 	pxor		$rndkey0,$inout1
    383 	aes${dir}	$rndkey1,$inout0
    384 	pxor		$rndkey0,$inout2
    385 	aes${dir}	$rndkey1,$inout1
    386 	pxor		$rndkey0,$inout3
    387 	aes${dir}	$rndkey1,$inout2
    388 	pxor		$rndkey0,$inout4
    389 	aes${dir}	$rndkey1,$inout3
    390 	pxor		$rndkey0,$inout5
    391 	dec		$rounds
    392 	aes${dir}	$rndkey1,$inout4
    393 	$movkey		($key),$rndkey0
    394 	aes${dir}	$rndkey1,$inout5
    395 	jmp		.L${dir}_loop6_enter
    396 .align	16
    397 .L${dir}_loop6:
    398 	aes${dir}	$rndkey1,$inout0
    399 	aes${dir}	$rndkey1,$inout1
    400 	dec		$rounds
    401 	aes${dir}	$rndkey1,$inout2
    402 	aes${dir}	$rndkey1,$inout3
    403 	aes${dir}	$rndkey1,$inout4
    404 	aes${dir}	$rndkey1,$inout5
    405 .L${dir}_loop6_enter:				# happens to be 16-byte aligned
    406 	$movkey		16($key),$rndkey1
    407 	aes${dir}	$rndkey0,$inout0
    408 	aes${dir}	$rndkey0,$inout1
    409 	lea		32($key),$key
    410 	aes${dir}	$rndkey0,$inout2
    411 	aes${dir}	$rndkey0,$inout3
    412 	aes${dir}	$rndkey0,$inout4
    413 	aes${dir}	$rndkey0,$inout5
    414 	$movkey		($key),$rndkey0
    415 	jnz		.L${dir}_loop6
    416 
    417 	aes${dir}	$rndkey1,$inout0
    418 	aes${dir}	$rndkey1,$inout1
    419 	aes${dir}	$rndkey1,$inout2
    420 	aes${dir}	$rndkey1,$inout3
    421 	aes${dir}	$rndkey1,$inout4
    422 	aes${dir}	$rndkey1,$inout5
    423 	aes${dir}last	$rndkey0,$inout0
    424 	aes${dir}last	$rndkey0,$inout1
    425 	aes${dir}last	$rndkey0,$inout2
    426 	aes${dir}last	$rndkey0,$inout3
    427 	aes${dir}last	$rndkey0,$inout4
    428 	aes${dir}last	$rndkey0,$inout5
    429 	ret
    430 .size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
    431 ___
    432 }
    433 sub aesni_generate8 {
    434 my $dir=shift;
    435 # As already mentioned it takes in $key and $rounds, which are *not*
    436 # preserved. $inout[0-7] is cipher/clear text...
    437 $code.=<<___;
    438 .type	_aesni_${dir}rypt8,\@abi-omnipotent
    439 .align	16
    440 _aesni_${dir}rypt8:
    441 	$movkey		($key),$rndkey0
    442 	shr		\$1,$rounds
    443 	$movkey		16($key),$rndkey1
    444 	lea		32($key),$key
    445 	xorps		$rndkey0,$inout0
    446 	xorps		$rndkey0,$inout1
    447 	aes${dir}	$rndkey1,$inout0
    448 	pxor		$rndkey0,$inout2
    449 	aes${dir}	$rndkey1,$inout1
    450 	pxor		$rndkey0,$inout3
    451 	aes${dir}	$rndkey1,$inout2
    452 	pxor		$rndkey0,$inout4
    453 	aes${dir}	$rndkey1,$inout3
    454 	pxor		$rndkey0,$inout5
    455 	dec		$rounds
    456 	aes${dir}	$rndkey1,$inout4
    457 	pxor		$rndkey0,$inout6
    458 	aes${dir}	$rndkey1,$inout5
    459 	pxor		$rndkey0,$inout7
    460 	$movkey		($key),$rndkey0
    461 	aes${dir}	$rndkey1,$inout6
    462 	aes${dir}	$rndkey1,$inout7
    463 	$movkey		16($key),$rndkey1
    464 	jmp		.L${dir}_loop8_enter
    465 .align	16
    466 .L${dir}_loop8:
    467 	aes${dir}	$rndkey1,$inout0
    468 	aes${dir}	$rndkey1,$inout1
    469 	dec		$rounds
    470 	aes${dir}	$rndkey1,$inout2
    471 	aes${dir}	$rndkey1,$inout3
    472 	aes${dir}	$rndkey1,$inout4
    473 	aes${dir}	$rndkey1,$inout5
    474 	aes${dir}	$rndkey1,$inout6
    475 	aes${dir}	$rndkey1,$inout7
    476 	$movkey		16($key),$rndkey1
    477 .L${dir}_loop8_enter:				# happens to be 16-byte aligned
    478 	aes${dir}	$rndkey0,$inout0
    479 	aes${dir}	$rndkey0,$inout1
    480 	lea		32($key),$key
    481 	aes${dir}	$rndkey0,$inout2
    482 	aes${dir}	$rndkey0,$inout3
    483 	aes${dir}	$rndkey0,$inout4
    484 	aes${dir}	$rndkey0,$inout5
    485 	aes${dir}	$rndkey0,$inout6
    486 	aes${dir}	$rndkey0,$inout7
    487 	$movkey		($key),$rndkey0
    488 	jnz		.L${dir}_loop8
    489 
    490 	aes${dir}	$rndkey1,$inout0
    491 	aes${dir}	$rndkey1,$inout1
    492 	aes${dir}	$rndkey1,$inout2
    493 	aes${dir}	$rndkey1,$inout3
    494 	aes${dir}	$rndkey1,$inout4
    495 	aes${dir}	$rndkey1,$inout5
    496 	aes${dir}	$rndkey1,$inout6
    497 	aes${dir}	$rndkey1,$inout7
    498 	aes${dir}last	$rndkey0,$inout0
    499 	aes${dir}last	$rndkey0,$inout1
    500 	aes${dir}last	$rndkey0,$inout2
    501 	aes${dir}last	$rndkey0,$inout3
    502 	aes${dir}last	$rndkey0,$inout4
    503 	aes${dir}last	$rndkey0,$inout5
    504 	aes${dir}last	$rndkey0,$inout6
    505 	aes${dir}last	$rndkey0,$inout7
    506 	ret
    507 .size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
    508 ___
    509 }
    510 &aesni_generate3("enc") if ($PREFIX eq "aesni");
    511 &aesni_generate3("dec");
    512 &aesni_generate4("enc") if ($PREFIX eq "aesni");
    513 &aesni_generate4("dec");
    514 &aesni_generate6("enc") if ($PREFIX eq "aesni");
    515 &aesni_generate6("dec");
    516 &aesni_generate8("enc") if ($PREFIX eq "aesni");
    517 &aesni_generate8("dec");
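# The "enc" flavours are generated only for the native AES-NI build,
# presumably because the aes-x86_64.pl drop-in needs multi-block
# subroutines solely for CBC decrypt (CBC encrypt being inherently
# serial, as discussed above), so the encrypt variants would be dead
# code there.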
    518 
    520 if ($PREFIX eq "aesni") {
    521 ########################################################################
    522 # void aesni_ecb_encrypt (const void *in, void *out,
    523 #			  size_t length, const AES_KEY *key,
    524 #			  int enc);
    525 $code.=<<___;
    526 .globl	aesni_ecb_encrypt
    527 .type	aesni_ecb_encrypt,\@function,5
    528 .align	16
    529 aesni_ecb_encrypt:
    530 	and	\$-16,$len
    531 	jz	.Lecb_ret
    532 
    533 	mov	240($key),$rounds	# key->rounds
    534 	$movkey	($key),$rndkey0
    535 	mov	$key,$key_		# backup $key
    536 	mov	$rounds,$rnds_		# backup $rounds
    537 	test	%r8d,%r8d		# 5th argument
    538 	jz	.Lecb_decrypt
    539 #--------------------------- ECB ENCRYPT ------------------------------#
    540 	cmp	\$0x80,$len
    541 	jb	.Lecb_enc_tail
    542 
    543 	movdqu	($inp),$inout0
    544 	movdqu	0x10($inp),$inout1
    545 	movdqu	0x20($inp),$inout2
    546 	movdqu	0x30($inp),$inout3
    547 	movdqu	0x40($inp),$inout4
    548 	movdqu	0x50($inp),$inout5
    549 	movdqu	0x60($inp),$inout6
    550 	movdqu	0x70($inp),$inout7
    551 	lea	0x80($inp),$inp
    552 	sub	\$0x80,$len
    553 	jmp	.Lecb_enc_loop8_enter
    554 .align 16
    555 .Lecb_enc_loop8:
    556 	movups	$inout0,($out)
    557 	mov	$key_,$key		# restore $key
    558 	movdqu	($inp),$inout0
    559 	mov	$rnds_,$rounds		# restore $rounds
    560 	movups	$inout1,0x10($out)
    561 	movdqu	0x10($inp),$inout1
    562 	movups	$inout2,0x20($out)
    563 	movdqu	0x20($inp),$inout2
    564 	movups	$inout3,0x30($out)
    565 	movdqu	0x30($inp),$inout3
    566 	movups	$inout4,0x40($out)
    567 	movdqu	0x40($inp),$inout4
    568 	movups	$inout5,0x50($out)
    569 	movdqu	0x50($inp),$inout5
    570 	movups	$inout6,0x60($out)
    571 	movdqu	0x60($inp),$inout6
    572 	movups	$inout7,0x70($out)
    573 	lea	0x80($out),$out
    574 	movdqu	0x70($inp),$inout7
    575 	lea	0x80($inp),$inp
    576 .Lecb_enc_loop8_enter:
    577 
    578 	call	_aesni_encrypt8
    579 
    580 	sub	\$0x80,$len
    581 	jnc	.Lecb_enc_loop8
    582 
    583 	movups	$inout0,($out)
    584 	mov	$key_,$key		# restore $key
    585 	movups	$inout1,0x10($out)
    586 	mov	$rnds_,$rounds		# restore $rounds
    587 	movups	$inout2,0x20($out)
    588 	movups	$inout3,0x30($out)
    589 	movups	$inout4,0x40($out)
    590 	movups	$inout5,0x50($out)
    591 	movups	$inout6,0x60($out)
    592 	movups	$inout7,0x70($out)
    593 	lea	0x80($out),$out
    594 	add	\$0x80,$len
    595 	jz	.Lecb_ret
    596 
    597 .Lecb_enc_tail:
    598 	movups	($inp),$inout0
    599 	cmp	\$0x20,$len
    600 	jb	.Lecb_enc_one
    601 	movups	0x10($inp),$inout1
    602 	je	.Lecb_enc_two
    603 	movups	0x20($inp),$inout2
    604 	cmp	\$0x40,$len
    605 	jb	.Lecb_enc_three
    606 	movups	0x30($inp),$inout3
    607 	je	.Lecb_enc_four
    608 	movups	0x40($inp),$inout4
    609 	cmp	\$0x60,$len
    610 	jb	.Lecb_enc_five
    611 	movups	0x50($inp),$inout5
    612 	je	.Lecb_enc_six
    613 	movdqu	0x60($inp),$inout6
    614 	call	_aesni_encrypt8
    615 	movups	$inout0,($out)
    616 	movups	$inout1,0x10($out)
    617 	movups	$inout2,0x20($out)
    618 	movups	$inout3,0x30($out)
    619 	movups	$inout4,0x40($out)
    620 	movups	$inout5,0x50($out)
    621 	movups	$inout6,0x60($out)
    622 	jmp	.Lecb_ret
    623 .align	16
    624 .Lecb_enc_one:
    625 ___
    626 	&aesni_generate1("enc",$key,$rounds);
    627 $code.=<<___;
    628 	movups	$inout0,($out)
    629 	jmp	.Lecb_ret
    630 .align	16
    631 .Lecb_enc_two:
    632 	xorps	$inout2,$inout2
    633 	call	_aesni_encrypt3
    634 	movups	$inout0,($out)
    635 	movups	$inout1,0x10($out)
    636 	jmp	.Lecb_ret
    637 .align	16
    638 .Lecb_enc_three:
    639 	call	_aesni_encrypt3
    640 	movups	$inout0,($out)
    641 	movups	$inout1,0x10($out)
    642 	movups	$inout2,0x20($out)
    643 	jmp	.Lecb_ret
    644 .align	16
    645 .Lecb_enc_four:
    646 	call	_aesni_encrypt4
    647 	movups	$inout0,($out)
    648 	movups	$inout1,0x10($out)
    649 	movups	$inout2,0x20($out)
    650 	movups	$inout3,0x30($out)
    651 	jmp	.Lecb_ret
    652 .align	16
    653 .Lecb_enc_five:
    654 	xorps	$inout5,$inout5
    655 	call	_aesni_encrypt6
    656 	movups	$inout0,($out)
    657 	movups	$inout1,0x10($out)
    658 	movups	$inout2,0x20($out)
    659 	movups	$inout3,0x30($out)
    660 	movups	$inout4,0x40($out)
    661 	jmp	.Lecb_ret
    662 .align	16
    663 .Lecb_enc_six:
    664 	call	_aesni_encrypt6
    665 	movups	$inout0,($out)
    666 	movups	$inout1,0x10($out)
    667 	movups	$inout2,0x20($out)
    668 	movups	$inout3,0x30($out)
    669 	movups	$inout4,0x40($out)
    670 	movups	$inout5,0x50($out)
    671 	jmp	.Lecb_ret
    672 #--------------------------- ECB DECRYPT ------------------------------#
    674 .align	16
    675 .Lecb_decrypt:
    676 	cmp	\$0x80,$len
    677 	jb	.Lecb_dec_tail
    678 
    679 	movdqu	($inp),$inout0
    680 	movdqu	0x10($inp),$inout1
    681 	movdqu	0x20($inp),$inout2
    682 	movdqu	0x30($inp),$inout3
    683 	movdqu	0x40($inp),$inout4
    684 	movdqu	0x50($inp),$inout5
    685 	movdqu	0x60($inp),$inout6
    686 	movdqu	0x70($inp),$inout7
    687 	lea	0x80($inp),$inp
    688 	sub	\$0x80,$len
    689 	jmp	.Lecb_dec_loop8_enter
    690 .align 16
    691 .Lecb_dec_loop8:
    692 	movups	$inout0,($out)
    693 	mov	$key_,$key		# restore $key
    694 	movdqu	($inp),$inout0
    695 	mov	$rnds_,$rounds		# restore $rounds
    696 	movups	$inout1,0x10($out)
    697 	movdqu	0x10($inp),$inout1
    698 	movups	$inout2,0x20($out)
    699 	movdqu	0x20($inp),$inout2
    700 	movups	$inout3,0x30($out)
    701 	movdqu	0x30($inp),$inout3
    702 	movups	$inout4,0x40($out)
    703 	movdqu	0x40($inp),$inout4
    704 	movups	$inout5,0x50($out)
    705 	movdqu	0x50($inp),$inout5
    706 	movups	$inout6,0x60($out)
    707 	movdqu	0x60($inp),$inout6
    708 	movups	$inout7,0x70($out)
    709 	lea	0x80($out),$out
    710 	movdqu	0x70($inp),$inout7
    711 	lea	0x80($inp),$inp
    712 .Lecb_dec_loop8_enter:
    713 
    714 	call	_aesni_decrypt8
    715 
    716 	$movkey	($key_),$rndkey0
    717 	sub	\$0x80,$len
    718 	jnc	.Lecb_dec_loop8
    719 
    720 	movups	$inout0,($out)
    721 	mov	$key_,$key		# restore $key
    722 	movups	$inout1,0x10($out)
    723 	mov	$rnds_,$rounds		# restore $rounds
    724 	movups	$inout2,0x20($out)
    725 	movups	$inout3,0x30($out)
    726 	movups	$inout4,0x40($out)
    727 	movups	$inout5,0x50($out)
    728 	movups	$inout6,0x60($out)
    729 	movups	$inout7,0x70($out)
    730 	lea	0x80($out),$out
    731 	add	\$0x80,$len
    732 	jz	.Lecb_ret
    733 
    734 .Lecb_dec_tail:
    735 	movups	($inp),$inout0
    736 	cmp	\$0x20,$len
    737 	jb	.Lecb_dec_one
    738 	movups	0x10($inp),$inout1
    739 	je	.Lecb_dec_two
    740 	movups	0x20($inp),$inout2
    741 	cmp	\$0x40,$len
    742 	jb	.Lecb_dec_three
    743 	movups	0x30($inp),$inout3
    744 	je	.Lecb_dec_four
    745 	movups	0x40($inp),$inout4
    746 	cmp	\$0x60,$len
    747 	jb	.Lecb_dec_five
    748 	movups	0x50($inp),$inout5
    749 	je	.Lecb_dec_six
    750 	movups	0x60($inp),$inout6
    751 	$movkey	($key),$rndkey0
    752 	call	_aesni_decrypt8
    753 	movups	$inout0,($out)
    754 	movups	$inout1,0x10($out)
    755 	movups	$inout2,0x20($out)
    756 	movups	$inout3,0x30($out)
    757 	movups	$inout4,0x40($out)
    758 	movups	$inout5,0x50($out)
    759 	movups	$inout6,0x60($out)
    760 	jmp	.Lecb_ret
    761 .align	16
    762 .Lecb_dec_one:
    763 ___
    764 	&aesni_generate1("dec",$key,$rounds);
    765 $code.=<<___;
    766 	movups	$inout0,($out)
    767 	jmp	.Lecb_ret
    768 .align	16
    769 .Lecb_dec_two:
    770 	xorps	$inout2,$inout2
    771 	call	_aesni_decrypt3
    772 	movups	$inout0,($out)
    773 	movups	$inout1,0x10($out)
    774 	jmp	.Lecb_ret
    775 .align	16
    776 .Lecb_dec_three:
    777 	call	_aesni_decrypt3
    778 	movups	$inout0,($out)
    779 	movups	$inout1,0x10($out)
    780 	movups	$inout2,0x20($out)
    781 	jmp	.Lecb_ret
    782 .align	16
    783 .Lecb_dec_four:
    784 	call	_aesni_decrypt4
    785 	movups	$inout0,($out)
    786 	movups	$inout1,0x10($out)
    787 	movups	$inout2,0x20($out)
    788 	movups	$inout3,0x30($out)
    789 	jmp	.Lecb_ret
    790 .align	16
    791 .Lecb_dec_five:
    792 	xorps	$inout5,$inout5
    793 	call	_aesni_decrypt6
    794 	movups	$inout0,($out)
    795 	movups	$inout1,0x10($out)
    796 	movups	$inout2,0x20($out)
    797 	movups	$inout3,0x30($out)
    798 	movups	$inout4,0x40($out)
    799 	jmp	.Lecb_ret
    800 .align	16
    801 .Lecb_dec_six:
    802 	call	_aesni_decrypt6
    803 	movups	$inout0,($out)
    804 	movups	$inout1,0x10($out)
    805 	movups	$inout2,0x20($out)
    806 	movups	$inout3,0x30($out)
    807 	movups	$inout4,0x40($out)
    808 	movups	$inout5,0x50($out)
    809 
    810 .Lecb_ret:
    811 	ret
    812 .size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
    813 ___
    814 
    816 {
    817 ######################################################################
    818 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
    819 #                         size_t blocks, const AES_KEY *key,
    820 #                         const char *ivec,char *cmac);
    821 #
# Handles only complete blocks, operates on a 64-bit counter and
# does not update *ivec! Nor does it finalize the CMAC value
# (see engine/eng_aesni.c for details).
    825 #
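# The loop below keeps the running CTR block in $inout0 and the running
# CBC-MAC in $inout1 and pushes both through the same round keys at
# once; this interleaving is what makes CCM cost about the same as CBC
# encrypt in the table at the top of the file.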
    826 {
    827 my $cmac="%r9";	# 6th argument
    828 
    829 my $increment="%xmm6";
    830 my $bswap_mask="%xmm7";
    831 
    832 $code.=<<___;
    833 .globl	aesni_ccm64_encrypt_blocks
    834 .type	aesni_ccm64_encrypt_blocks,\@function,6
    835 .align	16
    836 aesni_ccm64_encrypt_blocks:
    837 ___
    838 $code.=<<___ if ($win64);
    839 	lea	-0x58(%rsp),%rsp
    840 	movaps	%xmm6,(%rsp)
    841 	movaps	%xmm7,0x10(%rsp)
    842 	movaps	%xmm8,0x20(%rsp)
    843 	movaps	%xmm9,0x30(%rsp)
    844 .Lccm64_enc_body:
    845 ___
    846 $code.=<<___;
    847 	mov	240($key),$rounds		# key->rounds
    848 	movdqu	($ivp),$iv
    849 	movdqa	.Lincrement64(%rip),$increment
    850 	movdqa	.Lbswap_mask(%rip),$bswap_mask
    851 
    852 	shr	\$1,$rounds
    853 	lea	0($key),$key_
    854 	movdqu	($cmac),$inout1
    855 	movdqa	$iv,$inout0
    856 	mov	$rounds,$rnds_
    857 	pshufb	$bswap_mask,$iv
    858 	jmp	.Lccm64_enc_outer
    859 .align	16
    860 .Lccm64_enc_outer:
    861 	$movkey	($key_),$rndkey0
    862 	mov	$rnds_,$rounds
    863 	movups	($inp),$in0			# load inp
    864 
    865 	xorps	$rndkey0,$inout0		# counter
    866 	$movkey	16($key_),$rndkey1
    867 	xorps	$in0,$rndkey0
    868 	lea	32($key_),$key
    869 	xorps	$rndkey0,$inout1		# cmac^=inp
    870 	$movkey	($key),$rndkey0
    871 
    872 .Lccm64_enc2_loop:
    873 	aesenc	$rndkey1,$inout0
    874 	dec	$rounds
    875 	aesenc	$rndkey1,$inout1
    876 	$movkey	16($key),$rndkey1
    877 	aesenc	$rndkey0,$inout0
    878 	lea	32($key),$key
    879 	aesenc	$rndkey0,$inout1
    880 	$movkey	0($key),$rndkey0
    881 	jnz	.Lccm64_enc2_loop
    882 	aesenc	$rndkey1,$inout0
    883 	aesenc	$rndkey1,$inout1
    884 	paddq	$increment,$iv
    885 	aesenclast	$rndkey0,$inout0
    886 	aesenclast	$rndkey0,$inout1
    887 
    888 	dec	$len
    889 	lea	16($inp),$inp
    890 	xorps	$inout0,$in0			# inp ^= E(iv)
    891 	movdqa	$iv,$inout0
    892 	movups	$in0,($out)			# save output
    893 	lea	16($out),$out
    894 	pshufb	$bswap_mask,$inout0
    895 	jnz	.Lccm64_enc_outer
    896 
    897 	movups	$inout1,($cmac)
    898 ___
    899 $code.=<<___ if ($win64);
    900 	movaps	(%rsp),%xmm6
    901 	movaps	0x10(%rsp),%xmm7
    902 	movaps	0x20(%rsp),%xmm8
    903 	movaps	0x30(%rsp),%xmm9
    904 	lea	0x58(%rsp),%rsp
    905 .Lccm64_enc_ret:
    906 ___
    907 $code.=<<___;
    908 	ret
    909 .size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
    910 ___
    911 ######################################################################
    912 $code.=<<___;
    913 .globl	aesni_ccm64_decrypt_blocks
    914 .type	aesni_ccm64_decrypt_blocks,\@function,6
    915 .align	16
    916 aesni_ccm64_decrypt_blocks:
    917 ___
    918 $code.=<<___ if ($win64);
    919 	lea	-0x58(%rsp),%rsp
    920 	movaps	%xmm6,(%rsp)
    921 	movaps	%xmm7,0x10(%rsp)
    922 	movaps	%xmm8,0x20(%rsp)
    923 	movaps	%xmm9,0x30(%rsp)
    924 .Lccm64_dec_body:
    925 ___
    926 $code.=<<___;
    927 	mov	240($key),$rounds		# key->rounds
    928 	movups	($ivp),$iv
    929 	movdqu	($cmac),$inout1
    930 	movdqa	.Lincrement64(%rip),$increment
    931 	movdqa	.Lbswap_mask(%rip),$bswap_mask
    932 
    933 	movaps	$iv,$inout0
    934 	mov	$rounds,$rnds_
    935 	mov	$key,$key_
    936 	pshufb	$bswap_mask,$iv
    937 ___
    938 	&aesni_generate1("enc",$key,$rounds);
    939 $code.=<<___;
    940 	movups	($inp),$in0			# load inp
    941 	paddq	$increment,$iv
    942 	lea	16($inp),$inp
    943 	jmp	.Lccm64_dec_outer
    944 .align	16
    945 .Lccm64_dec_outer:
    946 	xorps	$inout0,$in0			# inp ^= E(iv)
    947 	movdqa	$iv,$inout0
    948 	mov	$rnds_,$rounds
    949 	movups	$in0,($out)			# save output
    950 	lea	16($out),$out
    951 	pshufb	$bswap_mask,$inout0
    952 
    953 	sub	\$1,$len
    954 	jz	.Lccm64_dec_break
    955 
    956 	$movkey	($key_),$rndkey0
    957 	shr	\$1,$rounds
    958 	$movkey	16($key_),$rndkey1
    959 	xorps	$rndkey0,$in0
    960 	lea	32($key_),$key
    961 	xorps	$rndkey0,$inout0
    962 	xorps	$in0,$inout1			# cmac^=out
    963 	$movkey	($key),$rndkey0
    964 
    965 .Lccm64_dec2_loop:
    966 	aesenc	$rndkey1,$inout0
    967 	dec	$rounds
    968 	aesenc	$rndkey1,$inout1
    969 	$movkey	16($key),$rndkey1
    970 	aesenc	$rndkey0,$inout0
    971 	lea	32($key),$key
    972 	aesenc	$rndkey0,$inout1
    973 	$movkey	0($key),$rndkey0
    974 	jnz	.Lccm64_dec2_loop
    975 	movups	($inp),$in0			# load inp
    976 	paddq	$increment,$iv
    977 	aesenc	$rndkey1,$inout0
    978 	aesenc	$rndkey1,$inout1
    979 	lea	16($inp),$inp
    980 	aesenclast	$rndkey0,$inout0
    981 	aesenclast	$rndkey0,$inout1
    982 	jmp	.Lccm64_dec_outer
    983 
    984 .align	16
    985 .Lccm64_dec_break:
    986 	#xorps	$in0,$inout1			# cmac^=out
    987 ___
    988 	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
    989 $code.=<<___;
    990 	movups	$inout1,($cmac)
    991 ___
    992 $code.=<<___ if ($win64);
    993 	movaps	(%rsp),%xmm6
    994 	movaps	0x10(%rsp),%xmm7
    995 	movaps	0x20(%rsp),%xmm8
    996 	movaps	0x30(%rsp),%xmm9
    997 	lea	0x58(%rsp),%rsp
    998 .Lccm64_dec_ret:
    999 ___
   1000 $code.=<<___;
   1001 	ret
   1002 .size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
   1003 ___
   1004 }
   1006 ######################################################################
   1007 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
   1008 #                         size_t blocks, const AES_KEY *key,
   1009 #                         const char *ivec);
   1010 #
# Handles only complete blocks, operates on a 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
   1013 #
   1014 {
   1015 my $reserved = $win64?0:-0x28;
   1016 my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
   1017 my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
   1018 my $bswap_mask="%xmm15";
   1019 
   1020 $code.=<<___;
   1021 .globl	aesni_ctr32_encrypt_blocks
   1022 .type	aesni_ctr32_encrypt_blocks,\@function,5
   1023 .align	16
   1024 aesni_ctr32_encrypt_blocks:
   1025 ___
   1026 $code.=<<___ if ($win64);
   1027 	lea	-0xc8(%rsp),%rsp
   1028 	movaps	%xmm6,0x20(%rsp)
   1029 	movaps	%xmm7,0x30(%rsp)
   1030 	movaps	%xmm8,0x40(%rsp)
   1031 	movaps	%xmm9,0x50(%rsp)
   1032 	movaps	%xmm10,0x60(%rsp)
   1033 	movaps	%xmm11,0x70(%rsp)
   1034 	movaps	%xmm12,0x80(%rsp)
   1035 	movaps	%xmm13,0x90(%rsp)
   1036 	movaps	%xmm14,0xa0(%rsp)
   1037 	movaps	%xmm15,0xb0(%rsp)
   1038 .Lctr32_body:
   1039 ___
   1040 $code.=<<___;
   1041 	cmp	\$1,$len
   1042 	je	.Lctr32_one_shortcut
   1043 
   1044 	movdqu	($ivp),$ivec
   1045 	movdqa	.Lbswap_mask(%rip),$bswap_mask
   1046 	xor	$rounds,$rounds
   1047 	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
   1048 	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
   1049 
   1050 	mov	240($key),$rounds		# key->rounds
   1051 	bswap	$rnds_
   1052 	pxor	$iv0,$iv0			# vector of 3 32-bit counters
   1053 	pxor	$iv1,$iv1			# vector of 3 32-bit counters
   1054 	pinsrd	\$0,$rnds_,$iv0
   1055 	lea	3($rnds_),$key_
   1056 	pinsrd	\$0,$key_,$iv1
   1057 	inc	$rnds_
   1058 	pinsrd	\$1,$rnds_,$iv0
   1059 	inc	$key_
   1060 	pinsrd	\$1,$key_,$iv1
   1061 	inc	$rnds_
   1062 	pinsrd	\$2,$rnds_,$iv0
   1063 	inc	$key_
   1064 	pinsrd	\$2,$key_,$iv1
   1065 	movdqa	$iv0,$reserved(%rsp)
   1066 	pshufb	$bswap_mask,$iv0
   1067 	movdqa	$iv1,`$reserved+0x10`(%rsp)
   1068 	pshufb	$bswap_mask,$iv1
   1069 
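	# At this point $iv0 carries the counter values c, c+1, c+2 and
	# $iv1 carries c+3, c+4, c+5, one per 32-bit lane: the little-endian
	# copies live on the stack and are advanced every 6-block iteration,
	# while the byte-swapped copies in the registers are spread out and
	# merged with the counter-less $ivec below.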
   1070 	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
   1071 	pshufd	\$`2<<6`,$iv0,$inout1
   1072 	pshufd	\$`1<<6`,$iv0,$inout2
   1073 	cmp	\$6,$len
   1074 	jb	.Lctr32_tail
   1075 	shr	\$1,$rounds
   1076 	mov	$key,$key_			# backup $key
   1077 	mov	$rounds,$rnds_			# backup $rounds
   1078 	sub	\$6,$len
   1079 	jmp	.Lctr32_loop6
   1080 
   1081 .align	16
   1082 .Lctr32_loop6:
   1083 	pshufd	\$`3<<6`,$iv1,$inout3
   1084 	por	$ivec,$inout0			# merge counter-less ivec
   1085 	 $movkey	($key_),$rndkey0
   1086 	pshufd	\$`2<<6`,$iv1,$inout4
   1087 	por	$ivec,$inout1
   1088 	 $movkey	16($key_),$rndkey1
   1089 	pshufd	\$`1<<6`,$iv1,$inout5
   1090 	por	$ivec,$inout2
   1091 	por	$ivec,$inout3
   1092 	 xorps		$rndkey0,$inout0
   1093 	por	$ivec,$inout4
   1094 	por	$ivec,$inout5
   1095 
   1096 	# inline _aesni_encrypt6 and interleave last rounds
   1097 	# with own code...
   1098 
   1099 	pxor		$rndkey0,$inout1
   1100 	aesenc		$rndkey1,$inout0
   1101 	lea		32($key_),$key
   1102 	pxor		$rndkey0,$inout2
   1103 	aesenc		$rndkey1,$inout1
   1104 	 movdqa		.Lincrement32(%rip),$iv1
   1105 	pxor		$rndkey0,$inout3
   1106 	aesenc		$rndkey1,$inout2
   1107 	 movdqa		$reserved(%rsp),$iv0
   1108 	pxor		$rndkey0,$inout4
   1109 	aesenc		$rndkey1,$inout3
   1110 	pxor		$rndkey0,$inout5
   1111 	$movkey		($key),$rndkey0
   1112 	dec		$rounds
   1113 	aesenc		$rndkey1,$inout4
   1114 	aesenc		$rndkey1,$inout5
   1115 	jmp		.Lctr32_enc_loop6_enter
   1116 .align	16
   1117 .Lctr32_enc_loop6:
   1118 	aesenc		$rndkey1,$inout0
   1119 	aesenc		$rndkey1,$inout1
   1120 	dec		$rounds
   1121 	aesenc		$rndkey1,$inout2
   1122 	aesenc		$rndkey1,$inout3
   1123 	aesenc		$rndkey1,$inout4
   1124 	aesenc		$rndkey1,$inout5
   1125 .Lctr32_enc_loop6_enter:
   1126 	$movkey		16($key),$rndkey1
   1127 	aesenc		$rndkey0,$inout0
   1128 	aesenc		$rndkey0,$inout1
   1129 	lea		32($key),$key
   1130 	aesenc		$rndkey0,$inout2
   1131 	aesenc		$rndkey0,$inout3
   1132 	aesenc		$rndkey0,$inout4
   1133 	aesenc		$rndkey0,$inout5
   1134 	$movkey		($key),$rndkey0
   1135 	jnz		.Lctr32_enc_loop6
   1136 
   1137 	aesenc		$rndkey1,$inout0
   1138 	 paddd		$iv1,$iv0		# increment counter vector
   1139 	aesenc		$rndkey1,$inout1
   1140 	 paddd		`$reserved+0x10`(%rsp),$iv1
   1141 	aesenc		$rndkey1,$inout2
   1142 	 movdqa		$iv0,$reserved(%rsp)	# save counter vector
   1143 	aesenc		$rndkey1,$inout3
   1144 	 movdqa		$iv1,`$reserved+0x10`(%rsp)
   1145 	aesenc		$rndkey1,$inout4
   1146 	 pshufb		$bswap_mask,$iv0	# byte swap
   1147 	aesenc		$rndkey1,$inout5
   1148 	 pshufb		$bswap_mask,$iv1
   1149 
   1150 	aesenclast	$rndkey0,$inout0
   1151 	 movups		($inp),$in0		# load input
   1152 	aesenclast	$rndkey0,$inout1
   1153 	 movups		0x10($inp),$in1
   1154 	aesenclast	$rndkey0,$inout2
   1155 	 movups		0x20($inp),$in2
   1156 	aesenclast	$rndkey0,$inout3
   1157 	 movups		0x30($inp),$in3
   1158 	aesenclast	$rndkey0,$inout4
   1159 	 movups		0x40($inp),$rndkey1
   1160 	aesenclast	$rndkey0,$inout5
   1161 	 movups		0x50($inp),$rndkey0
   1162 	 lea	0x60($inp),$inp
   1163 
   1164 	xorps	$inout0,$in0			# xor
   1165 	 pshufd	\$`3<<6`,$iv0,$inout0
   1166 	xorps	$inout1,$in1
   1167 	 pshufd	\$`2<<6`,$iv0,$inout1
   1168 	movups	$in0,($out)			# store output
   1169 	xorps	$inout2,$in2
   1170 	 pshufd	\$`1<<6`,$iv0,$inout2
   1171 	movups	$in1,0x10($out)
   1172 	xorps	$inout3,$in3
   1173 	movups	$in2,0x20($out)
   1174 	xorps	$inout4,$rndkey1
   1175 	movups	$in3,0x30($out)
   1176 	xorps	$inout5,$rndkey0
   1177 	movups	$rndkey1,0x40($out)
   1178 	movups	$rndkey0,0x50($out)
   1179 	lea	0x60($out),$out
   1180 	mov	$rnds_,$rounds
   1181 	sub	\$6,$len
   1182 	jnc	.Lctr32_loop6
   1183 
   1184 	add	\$6,$len
   1185 	jz	.Lctr32_done
   1186 	mov	$key_,$key			# restore $key
   1187 	lea	1($rounds,$rounds),$rounds	# restore original value
   1188 
   1189 .Lctr32_tail:
   1190 	por	$ivec,$inout0
   1191 	movups	($inp),$in0
   1192 	cmp	\$2,$len
   1193 	jb	.Lctr32_one
   1194 
   1195 	por	$ivec,$inout1
   1196 	movups	0x10($inp),$in1
   1197 	je	.Lctr32_two
   1198 
   1199 	pshufd	\$`3<<6`,$iv1,$inout3
   1200 	por	$ivec,$inout2
   1201 	movups	0x20($inp),$in2
   1202 	cmp	\$4,$len
   1203 	jb	.Lctr32_three
   1204 
   1205 	pshufd	\$`2<<6`,$iv1,$inout4
   1206 	por	$ivec,$inout3
   1207 	movups	0x30($inp),$in3
   1208 	je	.Lctr32_four
   1209 
   1210 	por	$ivec,$inout4
   1211 	xorps	$inout5,$inout5
   1212 
   1213 	call	_aesni_encrypt6
   1214 
   1215 	movups	0x40($inp),$rndkey1
   1216 	xorps	$inout0,$in0
   1217 	xorps	$inout1,$in1
   1218 	movups	$in0,($out)
   1219 	xorps	$inout2,$in2
   1220 	movups	$in1,0x10($out)
   1221 	xorps	$inout3,$in3
   1222 	movups	$in2,0x20($out)
   1223 	xorps	$inout4,$rndkey1
   1224 	movups	$in3,0x30($out)
   1225 	movups	$rndkey1,0x40($out)
   1226 	jmp	.Lctr32_done
   1227 
   1228 .align	16
   1229 .Lctr32_one_shortcut:
   1230 	movups	($ivp),$inout0
   1231 	movups	($inp),$in0
   1232 	mov	240($key),$rounds		# key->rounds
   1233 .Lctr32_one:
   1234 ___
   1235 	&aesni_generate1("enc",$key,$rounds);
   1236 $code.=<<___;
   1237 	xorps	$inout0,$in0
   1238 	movups	$in0,($out)
   1239 	jmp	.Lctr32_done
   1240 
   1241 .align	16
   1242 .Lctr32_two:
   1243 	xorps	$inout2,$inout2
   1244 	call	_aesni_encrypt3
   1245 	xorps	$inout0,$in0
   1246 	xorps	$inout1,$in1
   1247 	movups	$in0,($out)
   1248 	movups	$in1,0x10($out)
   1249 	jmp	.Lctr32_done
   1250 
   1251 .align	16
   1252 .Lctr32_three:
   1253 	call	_aesni_encrypt3
   1254 	xorps	$inout0,$in0
   1255 	xorps	$inout1,$in1
   1256 	movups	$in0,($out)
   1257 	xorps	$inout2,$in2
   1258 	movups	$in1,0x10($out)
   1259 	movups	$in2,0x20($out)
   1260 	jmp	.Lctr32_done
   1261 
   1262 .align	16
   1263 .Lctr32_four:
   1264 	call	_aesni_encrypt4
   1265 	xorps	$inout0,$in0
   1266 	xorps	$inout1,$in1
   1267 	movups	$in0,($out)
   1268 	xorps	$inout2,$in2
   1269 	movups	$in1,0x10($out)
   1270 	xorps	$inout3,$in3
   1271 	movups	$in2,0x20($out)
   1272 	movups	$in3,0x30($out)
   1273 
   1274 .Lctr32_done:
   1275 ___
   1276 $code.=<<___ if ($win64);
   1277 	movaps	0x20(%rsp),%xmm6
   1278 	movaps	0x30(%rsp),%xmm7
   1279 	movaps	0x40(%rsp),%xmm8
   1280 	movaps	0x50(%rsp),%xmm9
   1281 	movaps	0x60(%rsp),%xmm10
   1282 	movaps	0x70(%rsp),%xmm11
   1283 	movaps	0x80(%rsp),%xmm12
   1284 	movaps	0x90(%rsp),%xmm13
   1285 	movaps	0xa0(%rsp),%xmm14
   1286 	movaps	0xb0(%rsp),%xmm15
   1287 	lea	0xc8(%rsp),%rsp
   1288 .Lctr32_ret:
   1289 ___
   1290 $code.=<<___;
   1291 	ret
   1292 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
   1293 ___
   1294 }
   1295 
   1297 ######################################################################
   1298 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
   1300 #	const unsigned char iv[16]);
   1301 #
   1302 {
   1303 my @tweak=map("%xmm$_",(10..15));
   1304 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
   1305 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
   1306 my $frame_size = 0x68 + ($win64?160:0);
   1307 
   1308 $code.=<<___;
   1309 .globl	aesni_xts_encrypt
   1310 .type	aesni_xts_encrypt,\@function,6
   1311 .align	16
   1312 aesni_xts_encrypt:
   1313 	lea	-$frame_size(%rsp),%rsp
   1314 ___
   1315 $code.=<<___ if ($win64);
   1316 	movaps	%xmm6,0x60(%rsp)
   1317 	movaps	%xmm7,0x70(%rsp)
   1318 	movaps	%xmm8,0x80(%rsp)
   1319 	movaps	%xmm9,0x90(%rsp)
   1320 	movaps	%xmm10,0xa0(%rsp)
   1321 	movaps	%xmm11,0xb0(%rsp)
   1322 	movaps	%xmm12,0xc0(%rsp)
   1323 	movaps	%xmm13,0xd0(%rsp)
   1324 	movaps	%xmm14,0xe0(%rsp)
   1325 	movaps	%xmm15,0xf0(%rsp)
   1326 .Lxts_enc_body:
   1327 ___
   1328 $code.=<<___;
   1329 	movups	($ivp),@tweak[5]		# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
   1331 	mov	240($key),$rnds_		# key1->rounds
   1332 ___
   1333 	# generate the tweak
   1334 	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
   1335 $code.=<<___;
   1336 	mov	$key,$key_			# backup $key
   1337 	mov	$rnds_,$rounds			# backup $rounds
   1338 	mov	$len,$len_			# backup $len
   1339 	and	\$-16,$len
   1340 
   1341 	movdqa	.Lxts_magic(%rip),$twmask
   1342 	pxor	$twtmp,$twtmp
   1343 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1344 ___
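    # The loop below saves the tweaks for the first four blocks in
    # @tweak[0..3] and leaves the next one in @tweak[5]. Each step is the
    # usual doubling in GF(2^128): paddq shifts both 64-bit halves left by
    # one bit, while the pcmpgtd/pshufd/pand sequence turns the halves'
    # top bits into a correction word ($twres, masked with .Lxts_magic)
    # that carries the low half's top bit into the high half and folds
    # the x^128 reduction back in; pxor applies it to the doubled tweak.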
   1345     for ($i=0;$i<4;$i++) {
   1346     $code.=<<___;
   1347 	pshufd	\$0x13,$twtmp,$twres
   1348 	pxor	$twtmp,$twtmp
   1349 	movdqa	@tweak[5],@tweak[$i]
   1350 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1351 	pand	$twmask,$twres			# isolate carry and residue
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1353 	pxor	$twres,@tweak[5]
   1354 ___
   1355     }
   1356 $code.=<<___;
   1357 	sub	\$16*6,$len
   1358 	jc	.Lxts_enc_short
   1359 
   1360 	shr	\$1,$rounds
   1361 	sub	\$1,$rounds
   1362 	mov	$rounds,$rnds_
   1363 	jmp	.Lxts_enc_grandloop
   1364 
   1365 .align	16
   1366 .Lxts_enc_grandloop:
   1367 	pshufd	\$0x13,$twtmp,$twres
   1368 	movdqa	@tweak[5],@tweak[4]
   1369 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   1370 	movdqu	`16*0`($inp),$inout0		# load input
   1371 	pand	$twmask,$twres			# isolate carry and residue
   1372 	movdqu	`16*1`($inp),$inout1
   1373 	pxor	$twres,@tweak[5]
   1374 
   1375 	movdqu	`16*2`($inp),$inout2
   1376 	pxor	@tweak[0],$inout0		# input^=tweak
   1377 	movdqu	`16*3`($inp),$inout3
   1378 	pxor	@tweak[1],$inout1
   1379 	movdqu	`16*4`($inp),$inout4
   1380 	pxor	@tweak[2],$inout2
   1381 	movdqu	`16*5`($inp),$inout5
   1382 	lea	`16*6`($inp),$inp
   1383 	pxor	@tweak[3],$inout3
   1384 	$movkey		($key_),$rndkey0
   1385 	pxor	@tweak[4],$inout4
   1386 	pxor	@tweak[5],$inout5
   1387 
   1388 	# inline _aesni_encrypt6 and interleave first and last rounds
   1389 	# with own code...
   1390 	$movkey		16($key_),$rndkey1
   1391 	pxor		$rndkey0,$inout0
   1392 	pxor		$rndkey0,$inout1
   1393 	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
   1394 	aesenc		$rndkey1,$inout0
   1395 	lea		32($key_),$key
   1396 	pxor		$rndkey0,$inout2
   1397 	 movdqa	@tweak[1],`16*1`(%rsp)
   1398 	aesenc		$rndkey1,$inout1
   1399 	pxor		$rndkey0,$inout3
   1400 	 movdqa	@tweak[2],`16*2`(%rsp)
   1401 	aesenc		$rndkey1,$inout2
   1402 	pxor		$rndkey0,$inout4
   1403 	 movdqa	@tweak[3],`16*3`(%rsp)
   1404 	aesenc		$rndkey1,$inout3
   1405 	pxor		$rndkey0,$inout5
   1406 	$movkey		($key),$rndkey0
   1407 	dec		$rounds
   1408 	 movdqa	@tweak[4],`16*4`(%rsp)
   1409 	aesenc		$rndkey1,$inout4
   1410 	 movdqa	@tweak[5],`16*5`(%rsp)
   1411 	aesenc		$rndkey1,$inout5
   1412 	pxor	$twtmp,$twtmp
   1413 	pcmpgtd	@tweak[5],$twtmp
   1414 	jmp		.Lxts_enc_loop6_enter
   1415 
   1416 .align	16
   1417 .Lxts_enc_loop6:
   1418 	aesenc		$rndkey1,$inout0
   1419 	aesenc		$rndkey1,$inout1
   1420 	dec		$rounds
   1421 	aesenc		$rndkey1,$inout2
   1422 	aesenc		$rndkey1,$inout3
   1423 	aesenc		$rndkey1,$inout4
   1424 	aesenc		$rndkey1,$inout5
   1425 .Lxts_enc_loop6_enter:
   1426 	$movkey		16($key),$rndkey1
   1427 	aesenc		$rndkey0,$inout0
   1428 	aesenc		$rndkey0,$inout1
   1429 	lea		32($key),$key
   1430 	aesenc		$rndkey0,$inout2
   1431 	aesenc		$rndkey0,$inout3
   1432 	aesenc		$rndkey0,$inout4
   1433 	aesenc		$rndkey0,$inout5
   1434 	$movkey		($key),$rndkey0
   1435 	jnz		.Lxts_enc_loop6
   1436 
   1437 	pshufd	\$0x13,$twtmp,$twres
   1438 	pxor	$twtmp,$twtmp
   1439 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1440 	 aesenc		$rndkey1,$inout0
   1441 	pand	$twmask,$twres			# isolate carry and residue
   1442 	 aesenc		$rndkey1,$inout1
   1443 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1444 	 aesenc		$rndkey1,$inout2
   1445 	pxor	$twres,@tweak[5]
   1446 	 aesenc		$rndkey1,$inout3
   1447 	 aesenc		$rndkey1,$inout4
   1448 	 aesenc		$rndkey1,$inout5
   1449 	 $movkey	16($key),$rndkey1
   1450 
   1451 	pshufd	\$0x13,$twtmp,$twres
   1452 	pxor	$twtmp,$twtmp
   1453 	movdqa	@tweak[5],@tweak[0]
   1454 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1455 	 aesenc		$rndkey0,$inout0
   1456 	pand	$twmask,$twres			# isolate carry and residue
   1457 	 aesenc		$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1459 	 aesenc		$rndkey0,$inout2
   1460 	pxor	$twres,@tweak[5]
   1461 	 aesenc		$rndkey0,$inout3
   1462 	 aesenc		$rndkey0,$inout4
   1463 	 aesenc		$rndkey0,$inout5
   1464 	 $movkey	32($key),$rndkey0
   1465 
   1466 	pshufd	\$0x13,$twtmp,$twres
   1467 	pxor	$twtmp,$twtmp
   1468 	movdqa	@tweak[5],@tweak[1]
   1469 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1470 	 aesenc		$rndkey1,$inout0
   1471 	pand	$twmask,$twres			# isolate carry and residue
   1472 	 aesenc		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1474 	 aesenc		$rndkey1,$inout2
   1475 	pxor	$twres,@tweak[5]
   1476 	 aesenc		$rndkey1,$inout3
   1477 	 aesenc		$rndkey1,$inout4
   1478 	 aesenc		$rndkey1,$inout5
   1479 
   1480 	pshufd	\$0x13,$twtmp,$twres
   1481 	pxor	$twtmp,$twtmp
   1482 	movdqa	@tweak[5],@tweak[2]
   1483 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1484 	 aesenclast	$rndkey0,$inout0
   1485 	pand	$twmask,$twres			# isolate carry and residue
   1486 	 aesenclast	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1488 	 aesenclast	$rndkey0,$inout2
   1489 	pxor	$twres,@tweak[5]
   1490 	 aesenclast	$rndkey0,$inout3
   1491 	 aesenclast	$rndkey0,$inout4
   1492 	 aesenclast	$rndkey0,$inout5
   1493 
   1494 	pshufd	\$0x13,$twtmp,$twres
   1495 	pxor	$twtmp,$twtmp
   1496 	movdqa	@tweak[5],@tweak[3]
   1497 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1498 	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
   1499 	pand	$twmask,$twres			# isolate carry and residue
   1500 	 xorps	`16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1502 	pxor	$twres,@tweak[5]
   1503 
   1504 	xorps	`16*2`(%rsp),$inout2
   1505 	movups	$inout0,`16*0`($out)		# write output
   1506 	xorps	`16*3`(%rsp),$inout3
   1507 	movups	$inout1,`16*1`($out)
   1508 	xorps	`16*4`(%rsp),$inout4
   1509 	movups	$inout2,`16*2`($out)
   1510 	xorps	`16*5`(%rsp),$inout5
   1511 	movups	$inout3,`16*3`($out)
   1512 	mov	$rnds_,$rounds			# restore $rounds
   1513 	movups	$inout4,`16*4`($out)
   1514 	movups	$inout5,`16*5`($out)
   1515 	lea	`16*6`($out),$out
   1516 	sub	\$16*6,$len
   1517 	jnc	.Lxts_enc_grandloop
   1518 
   1519 	lea	3($rounds,$rounds),$rounds	# restore original value
   1520 	mov	$key_,$key			# restore $key
   1521 	mov	$rounds,$rnds_			# backup $rounds
   1522 
   1523 .Lxts_enc_short:
   1524 	add	\$16*6,$len
   1525 	jz	.Lxts_enc_done
   1526 
   1527 	cmp	\$0x20,$len
   1528 	jb	.Lxts_enc_one
   1529 	je	.Lxts_enc_two
   1530 
   1531 	cmp	\$0x40,$len
   1532 	jb	.Lxts_enc_three
   1533 	je	.Lxts_enc_four
   1534 
   1535 	pshufd	\$0x13,$twtmp,$twres
   1536 	movdqa	@tweak[5],@tweak[4]
   1537 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   1538 	 movdqu	($inp),$inout0
   1539 	pand	$twmask,$twres			# isolate carry and residue
   1540 	 movdqu	16*1($inp),$inout1
   1541 	pxor	$twres,@tweak[5]
   1542 
   1543 	movdqu	16*2($inp),$inout2
   1544 	pxor	@tweak[0],$inout0
   1545 	movdqu	16*3($inp),$inout3
   1546 	pxor	@tweak[1],$inout1
   1547 	movdqu	16*4($inp),$inout4
   1548 	lea	16*5($inp),$inp
   1549 	pxor	@tweak[2],$inout2
   1550 	pxor	@tweak[3],$inout3
   1551 	pxor	@tweak[4],$inout4
   1552 
   1553 	call	_aesni_encrypt6
   1554 
   1555 	xorps	@tweak[0],$inout0
   1556 	movdqa	@tweak[5],@tweak[0]
   1557 	xorps	@tweak[1],$inout1
   1558 	xorps	@tweak[2],$inout2
   1559 	movdqu	$inout0,($out)
   1560 	xorps	@tweak[3],$inout3
   1561 	movdqu	$inout1,16*1($out)
   1562 	xorps	@tweak[4],$inout4
   1563 	movdqu	$inout2,16*2($out)
   1564 	movdqu	$inout3,16*3($out)
   1565 	movdqu	$inout4,16*4($out)
   1566 	lea	16*5($out),$out
   1567 	jmp	.Lxts_enc_done
   1568 
   1569 .align	16
   1570 .Lxts_enc_one:
   1571 	movups	($inp),$inout0
   1572 	lea	16*1($inp),$inp
   1573 	xorps	@tweak[0],$inout0
   1574 ___
   1575 	&aesni_generate1("enc",$key,$rounds);
   1576 $code.=<<___;
   1577 	xorps	@tweak[0],$inout0
   1578 	movdqa	@tweak[1],@tweak[0]
   1579 	movups	$inout0,($out)
   1580 	lea	16*1($out),$out
   1581 	jmp	.Lxts_enc_done
   1582 
   1583 .align	16
   1584 .Lxts_enc_two:
   1585 	movups	($inp),$inout0
   1586 	movups	16($inp),$inout1
   1587 	lea	32($inp),$inp
   1588 	xorps	@tweak[0],$inout0
   1589 	xorps	@tweak[1],$inout1
   1590 
   1591 	call	_aesni_encrypt3
   1592 
   1593 	xorps	@tweak[0],$inout0
   1594 	movdqa	@tweak[2],@tweak[0]
   1595 	xorps	@tweak[1],$inout1
   1596 	movups	$inout0,($out)
   1597 	movups	$inout1,16*1($out)
   1598 	lea	16*2($out),$out
   1599 	jmp	.Lxts_enc_done
   1600 
   1601 .align	16
   1602 .Lxts_enc_three:
   1603 	movups	($inp),$inout0
   1604 	movups	16*1($inp),$inout1
   1605 	movups	16*2($inp),$inout2
   1606 	lea	16*3($inp),$inp
   1607 	xorps	@tweak[0],$inout0
   1608 	xorps	@tweak[1],$inout1
   1609 	xorps	@tweak[2],$inout2
   1610 
   1611 	call	_aesni_encrypt3
   1612 
   1613 	xorps	@tweak[0],$inout0
   1614 	movdqa	@tweak[3],@tweak[0]
   1615 	xorps	@tweak[1],$inout1
   1616 	xorps	@tweak[2],$inout2
   1617 	movups	$inout0,($out)
   1618 	movups	$inout1,16*1($out)
   1619 	movups	$inout2,16*2($out)
   1620 	lea	16*3($out),$out
   1621 	jmp	.Lxts_enc_done
   1622 
   1623 .align	16
   1624 .Lxts_enc_four:
   1625 	movups	($inp),$inout0
   1626 	movups	16*1($inp),$inout1
   1627 	movups	16*2($inp),$inout2
   1628 	xorps	@tweak[0],$inout0
   1629 	movups	16*3($inp),$inout3
   1630 	lea	16*4($inp),$inp
   1631 	xorps	@tweak[1],$inout1
   1632 	xorps	@tweak[2],$inout2
   1633 	xorps	@tweak[3],$inout3
   1634 
   1635 	call	_aesni_encrypt4
   1636 
   1637 	xorps	@tweak[0],$inout0
   1638 	movdqa	@tweak[5],@tweak[0]
   1639 	xorps	@tweak[1],$inout1
   1640 	xorps	@tweak[2],$inout2
   1641 	movups	$inout0,($out)
   1642 	xorps	@tweak[3],$inout3
   1643 	movups	$inout1,16*1($out)
   1644 	movups	$inout2,16*2($out)
   1645 	movups	$inout3,16*3($out)
   1646 	lea	16*4($out),$out
   1647 	jmp	.Lxts_enc_done
   1648 
   1649 .align	16
   1650 .Lxts_enc_done:
   1651 	and	\$15,$len_
   1652 	jz	.Lxts_enc_ret
   1653 	mov	$len_,$len
   1654 
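	# Ciphertext stealing: the leading bytes of the last full ciphertext
	# block become the final partial output, the remaining plaintext bytes
	# take their place, and the patched block is re-encrypted in place
	# below with the next tweak value.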
   1655 .Lxts_enc_steal:
   1656 	movzb	($inp),%eax			# borrow $rounds ...
   1657 	movzb	-16($out),%ecx			# ... and $key
   1658 	lea	1($inp),$inp
   1659 	mov	%al,-16($out)
   1660 	mov	%cl,0($out)
   1661 	lea	1($out),$out
   1662 	sub	\$1,$len
   1663 	jnz	.Lxts_enc_steal
   1664 
   1665 	sub	$len_,$out			# rewind $out
   1666 	mov	$key_,$key			# restore $key
   1667 	mov	$rnds_,$rounds			# restore $rounds
   1668 
   1669 	movups	-16($out),$inout0
   1670 	xorps	@tweak[0],$inout0
   1671 ___
   1672 	&aesni_generate1("enc",$key,$rounds);
   1673 $code.=<<___;
   1674 	xorps	@tweak[0],$inout0
   1675 	movups	$inout0,-16($out)
   1676 
   1677 .Lxts_enc_ret:
   1678 ___
   1679 $code.=<<___ if ($win64);
   1680 	movaps	0x60(%rsp),%xmm6
   1681 	movaps	0x70(%rsp),%xmm7
   1682 	movaps	0x80(%rsp),%xmm8
   1683 	movaps	0x90(%rsp),%xmm9
   1684 	movaps	0xa0(%rsp),%xmm10
   1685 	movaps	0xb0(%rsp),%xmm11
   1686 	movaps	0xc0(%rsp),%xmm12
   1687 	movaps	0xd0(%rsp),%xmm13
   1688 	movaps	0xe0(%rsp),%xmm14
   1689 	movaps	0xf0(%rsp),%xmm15
   1690 ___
   1691 $code.=<<___;
   1692 	lea	$frame_size(%rsp),%rsp
   1693 .Lxts_enc_epilogue:
   1694 	ret
   1695 .size	aesni_xts_encrypt,.-aesni_xts_encrypt
   1696 ___
   1697 
   1698 $code.=<<___;
   1699 .globl	aesni_xts_decrypt
   1700 .type	aesni_xts_decrypt,\@function,6
   1701 .align	16
   1702 aesni_xts_decrypt:
   1703 	lea	-$frame_size(%rsp),%rsp
   1704 ___
   1705 $code.=<<___ if ($win64);
   1706 	movaps	%xmm6,0x60(%rsp)
   1707 	movaps	%xmm7,0x70(%rsp)
   1708 	movaps	%xmm8,0x80(%rsp)
   1709 	movaps	%xmm9,0x90(%rsp)
   1710 	movaps	%xmm10,0xa0(%rsp)
   1711 	movaps	%xmm11,0xb0(%rsp)
   1712 	movaps	%xmm12,0xc0(%rsp)
   1713 	movaps	%xmm13,0xd0(%rsp)
   1714 	movaps	%xmm14,0xe0(%rsp)
   1715 	movaps	%xmm15,0xf0(%rsp)
   1716 .Lxts_dec_body:
   1717 ___
   1718 $code.=<<___;
   1719 	movups	($ivp),@tweak[5]		# load clear-text tweak
   1720 	mov	240($key2),$rounds		# key2->rounds
   1721 	mov	240($key),$rnds_		# key1->rounds
   1722 ___
   1723 	# generate the tweak
   1724 	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
   1725 $code.=<<___;
   1726 	xor	%eax,%eax			# if ($len%16) len-=16;
   1727 	test	\$15,$len
   1728 	setnz	%al
   1729 	shl	\$4,%rax
   1730 	sub	%rax,$len
   1731 
   1732 	mov	$key,$key_			# backup $key
   1733 	mov	$rnds_,$rounds			# backup $rounds
   1734 	mov	$len,$len_			# backup $len
   1735 	and	\$-16,$len
   1736 
   1737 	movdqa	.Lxts_magic(%rip),$twmask
   1738 	pxor	$twtmp,$twtmp
   1739 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1740 ___
   1741     for ($i=0;$i<4;$i++) {
   1742     $code.=<<___;
   1743 	pshufd	\$0x13,$twtmp,$twres
   1744 	pxor	$twtmp,$twtmp
   1745 	movdqa	@tweak[5],@tweak[$i]
   1746 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1747 	pand	$twmask,$twres			# isolate carry and residue
    1748 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1749 	pxor	$twres,@tweak[5]
   1750 ___
   1751     }
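# The loop above stores the tweaks for the first four blocks in @tweak[0..3]
# and leaves the running value, advanced four doublings, in @tweak[5].  Each
# doubling multiplies the tweak by x in GF(2^128) modulo x^128+x^7+x^2+x+1:
# paddq shifts each 64-bit half left by one, and the pcmpgtd/pshufd/pand
# sequence against .Lxts_magic propagates the carries.  A scalar C sketch of
# the same update (illustrative only, not part of the generated code):
#
#	static void xts_mul_x(unsigned char t[16])	/* little-endian tweak */
#	{
#		unsigned int carry = 0, c, i;
#		for (i = 0; i < 16; i++) {
#			c = t[i] >> 7;		/* bit shifted out of this byte */
#			t[i] = (unsigned char)((t[i] << 1) | carry);
#			carry = c;
#		}
#		if (carry) t[0] ^= 0x87;	/* reduce by the XTS polynomial */
#	}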
   1752 $code.=<<___;
   1753 	sub	\$16*6,$len
   1754 	jc	.Lxts_dec_short
   1755 
   1756 	shr	\$1,$rounds
   1757 	sub	\$1,$rounds
   1758 	mov	$rounds,$rnds_
   1759 	jmp	.Lxts_dec_grandloop
   1760 
   1761 .align	16
   1762 .Lxts_dec_grandloop:
   1763 	pshufd	\$0x13,$twtmp,$twres
   1764 	movdqa	@tweak[5],@tweak[4]
   1765 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   1766 	movdqu	`16*0`($inp),$inout0		# load input
   1767 	pand	$twmask,$twres			# isolate carry and residue
   1768 	movdqu	`16*1`($inp),$inout1
   1769 	pxor	$twres,@tweak[5]
   1770 
   1771 	movdqu	`16*2`($inp),$inout2
   1772 	pxor	@tweak[0],$inout0		# input^=tweak
   1773 	movdqu	`16*3`($inp),$inout3
   1774 	pxor	@tweak[1],$inout1
   1775 	movdqu	`16*4`($inp),$inout4
   1776 	pxor	@tweak[2],$inout2
   1777 	movdqu	`16*5`($inp),$inout5
   1778 	lea	`16*6`($inp),$inp
   1779 	pxor	@tweak[3],$inout3
   1780 	$movkey		($key_),$rndkey0
   1781 	pxor	@tweak[4],$inout4
   1782 	pxor	@tweak[5],$inout5
   1783 
   1784 	# inline _aesni_decrypt6 and interleave first and last rounds
   1785 	# with own code...
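	# (The tweak doubling below is interleaved with the decryption rounds
	# so the shuffle/compare work overlaps the round-instruction latency;
	# this is what lets the six-block path approach the parallel-mode
	# limit quoted at the top of the file.)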
   1786 	$movkey		16($key_),$rndkey1
   1787 	pxor		$rndkey0,$inout0
   1788 	pxor		$rndkey0,$inout1
   1789 	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
   1790 	aesdec		$rndkey1,$inout0
   1791 	lea		32($key_),$key
   1792 	pxor		$rndkey0,$inout2
   1793 	 movdqa	@tweak[1],`16*1`(%rsp)
   1794 	aesdec		$rndkey1,$inout1
   1795 	pxor		$rndkey0,$inout3
   1796 	 movdqa	@tweak[2],`16*2`(%rsp)
   1797 	aesdec		$rndkey1,$inout2
   1798 	pxor		$rndkey0,$inout4
   1799 	 movdqa	@tweak[3],`16*3`(%rsp)
   1800 	aesdec		$rndkey1,$inout3
   1801 	pxor		$rndkey0,$inout5
   1802 	$movkey		($key),$rndkey0
   1803 	dec		$rounds
   1804 	 movdqa	@tweak[4],`16*4`(%rsp)
   1805 	aesdec		$rndkey1,$inout4
   1806 	 movdqa	@tweak[5],`16*5`(%rsp)
   1807 	aesdec		$rndkey1,$inout5
   1808 	pxor	$twtmp,$twtmp
   1809 	pcmpgtd	@tweak[5],$twtmp
   1810 	jmp		.Lxts_dec_loop6_enter
   1811 
   1812 .align	16
   1813 .Lxts_dec_loop6:
   1814 	aesdec		$rndkey1,$inout0
   1815 	aesdec		$rndkey1,$inout1
   1816 	dec		$rounds
   1817 	aesdec		$rndkey1,$inout2
   1818 	aesdec		$rndkey1,$inout3
   1819 	aesdec		$rndkey1,$inout4
   1820 	aesdec		$rndkey1,$inout5
   1821 .Lxts_dec_loop6_enter:
   1822 	$movkey		16($key),$rndkey1
   1823 	aesdec		$rndkey0,$inout0
   1824 	aesdec		$rndkey0,$inout1
   1825 	lea		32($key),$key
   1826 	aesdec		$rndkey0,$inout2
   1827 	aesdec		$rndkey0,$inout3
   1828 	aesdec		$rndkey0,$inout4
   1829 	aesdec		$rndkey0,$inout5
   1830 	$movkey		($key),$rndkey0
   1831 	jnz		.Lxts_dec_loop6
   1832 
   1833 	pshufd	\$0x13,$twtmp,$twres
   1834 	pxor	$twtmp,$twtmp
   1835 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1836 	 aesdec		$rndkey1,$inout0
   1837 	pand	$twmask,$twres			# isolate carry and residue
   1838 	 aesdec		$rndkey1,$inout1
   1839 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1840 	 aesdec		$rndkey1,$inout2
   1841 	pxor	$twres,@tweak[5]
   1842 	 aesdec		$rndkey1,$inout3
   1843 	 aesdec		$rndkey1,$inout4
   1844 	 aesdec		$rndkey1,$inout5
   1845 	 $movkey	16($key),$rndkey1
   1846 
   1847 	pshufd	\$0x13,$twtmp,$twres
   1848 	pxor	$twtmp,$twtmp
   1849 	movdqa	@tweak[5],@tweak[0]
   1850 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1851 	 aesdec		$rndkey0,$inout0
   1852 	pand	$twmask,$twres			# isolate carry and residue
   1853 	 aesdec		$rndkey0,$inout1
    1854 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1855 	 aesdec		$rndkey0,$inout2
   1856 	pxor	$twres,@tweak[5]
   1857 	 aesdec		$rndkey0,$inout3
   1858 	 aesdec		$rndkey0,$inout4
   1859 	 aesdec		$rndkey0,$inout5
   1860 	 $movkey	32($key),$rndkey0
   1861 
   1862 	pshufd	\$0x13,$twtmp,$twres
   1863 	pxor	$twtmp,$twtmp
   1864 	movdqa	@tweak[5],@tweak[1]
   1865 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1866 	 aesdec		$rndkey1,$inout0
   1867 	pand	$twmask,$twres			# isolate carry and residue
   1868 	 aesdec		$rndkey1,$inout1
    1869 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1870 	 aesdec		$rndkey1,$inout2
   1871 	pxor	$twres,@tweak[5]
   1872 	 aesdec		$rndkey1,$inout3
   1873 	 aesdec		$rndkey1,$inout4
   1874 	 aesdec		$rndkey1,$inout5
   1875 
   1876 	pshufd	\$0x13,$twtmp,$twres
   1877 	pxor	$twtmp,$twtmp
   1878 	movdqa	@tweak[5],@tweak[2]
   1879 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1880 	 aesdeclast	$rndkey0,$inout0
   1881 	pand	$twmask,$twres			# isolate carry and residue
   1882 	 aesdeclast	$rndkey0,$inout1
    1883 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1884 	 aesdeclast	$rndkey0,$inout2
   1885 	pxor	$twres,@tweak[5]
   1886 	 aesdeclast	$rndkey0,$inout3
   1887 	 aesdeclast	$rndkey0,$inout4
   1888 	 aesdeclast	$rndkey0,$inout5
   1889 
   1890 	pshufd	\$0x13,$twtmp,$twres
   1891 	pxor	$twtmp,$twtmp
   1892 	movdqa	@tweak[5],@tweak[3]
   1893 	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
   1894 	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
   1895 	pand	$twmask,$twres			# isolate carry and residue
   1896 	 xorps	`16*1`(%rsp),$inout1
    1897 	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
   1898 	pxor	$twres,@tweak[5]
   1899 
   1900 	xorps	`16*2`(%rsp),$inout2
   1901 	movups	$inout0,`16*0`($out)		# write output
   1902 	xorps	`16*3`(%rsp),$inout3
   1903 	movups	$inout1,`16*1`($out)
   1904 	xorps	`16*4`(%rsp),$inout4
   1905 	movups	$inout2,`16*2`($out)
   1906 	xorps	`16*5`(%rsp),$inout5
   1907 	movups	$inout3,`16*3`($out)
   1908 	mov	$rnds_,$rounds			# restore $rounds
   1909 	movups	$inout4,`16*4`($out)
   1910 	movups	$inout5,`16*5`($out)
   1911 	lea	`16*6`($out),$out
   1912 	sub	\$16*6,$len
   1913 	jnc	.Lxts_dec_grandloop
   1914 
   1915 	lea	3($rounds,$rounds),$rounds	# restore original value
   1916 	mov	$key_,$key			# restore $key
   1917 	mov	$rounds,$rnds_			# backup $rounds
   1918 
   1919 .Lxts_dec_short:
   1920 	add	\$16*6,$len
   1921 	jz	.Lxts_dec_done
   1922 
   1923 	cmp	\$0x20,$len
   1924 	jb	.Lxts_dec_one
   1925 	je	.Lxts_dec_two
   1926 
   1927 	cmp	\$0x40,$len
   1928 	jb	.Lxts_dec_three
   1929 	je	.Lxts_dec_four
   1930 
   1931 	pshufd	\$0x13,$twtmp,$twres
   1932 	movdqa	@tweak[5],@tweak[4]
   1933 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   1934 	 movdqu	($inp),$inout0
   1935 	pand	$twmask,$twres			# isolate carry and residue
   1936 	 movdqu	16*1($inp),$inout1
   1937 	pxor	$twres,@tweak[5]
   1938 
   1939 	movdqu	16*2($inp),$inout2
   1940 	pxor	@tweak[0],$inout0
   1941 	movdqu	16*3($inp),$inout3
   1942 	pxor	@tweak[1],$inout1
   1943 	movdqu	16*4($inp),$inout4
   1944 	lea	16*5($inp),$inp
   1945 	pxor	@tweak[2],$inout2
   1946 	pxor	@tweak[3],$inout3
   1947 	pxor	@tweak[4],$inout4
   1948 
   1949 	call	_aesni_decrypt6
   1950 
   1951 	xorps	@tweak[0],$inout0
   1952 	xorps	@tweak[1],$inout1
   1953 	xorps	@tweak[2],$inout2
   1954 	movdqu	$inout0,($out)
   1955 	xorps	@tweak[3],$inout3
   1956 	movdqu	$inout1,16*1($out)
   1957 	xorps	@tweak[4],$inout4
   1958 	movdqu	$inout2,16*2($out)
   1959 	 pxor		$twtmp,$twtmp
   1960 	movdqu	$inout3,16*3($out)
   1961 	 pcmpgtd	@tweak[5],$twtmp
   1962 	movdqu	$inout4,16*4($out)
   1963 	lea	16*5($out),$out
   1964 	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
   1965 	and	\$15,$len_
   1966 	jz	.Lxts_dec_ret
   1967 
   1968 	movdqa	@tweak[5],@tweak[0]
   1969 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   1970 	pand	$twmask,@tweak[1]		# isolate carry and residue
   1971 	pxor	@tweak[5],@tweak[1]
   1972 	jmp	.Lxts_dec_done2
   1973 
   1974 .align	16
   1975 .Lxts_dec_one:
   1976 	movups	($inp),$inout0
   1977 	lea	16*1($inp),$inp
   1978 	xorps	@tweak[0],$inout0
   1979 ___
   1980 	&aesni_generate1("dec",$key,$rounds);
   1981 $code.=<<___;
   1982 	xorps	@tweak[0],$inout0
   1983 	movdqa	@tweak[1],@tweak[0]
   1984 	movups	$inout0,($out)
   1985 	movdqa	@tweak[2],@tweak[1]
   1986 	lea	16*1($out),$out
   1987 	jmp	.Lxts_dec_done
   1988 
   1989 .align	16
   1990 .Lxts_dec_two:
   1991 	movups	($inp),$inout0
   1992 	movups	16($inp),$inout1
   1993 	lea	32($inp),$inp
   1994 	xorps	@tweak[0],$inout0
   1995 	xorps	@tweak[1],$inout1
   1996 
   1997 	call	_aesni_decrypt3
   1998 
   1999 	xorps	@tweak[0],$inout0
   2000 	movdqa	@tweak[2],@tweak[0]
   2001 	xorps	@tweak[1],$inout1
   2002 	movdqa	@tweak[3],@tweak[1]
   2003 	movups	$inout0,($out)
   2004 	movups	$inout1,16*1($out)
   2005 	lea	16*2($out),$out
   2006 	jmp	.Lxts_dec_done
   2007 
   2008 .align	16
   2009 .Lxts_dec_three:
   2010 	movups	($inp),$inout0
   2011 	movups	16*1($inp),$inout1
   2012 	movups	16*2($inp),$inout2
   2013 	lea	16*3($inp),$inp
   2014 	xorps	@tweak[0],$inout0
   2015 	xorps	@tweak[1],$inout1
   2016 	xorps	@tweak[2],$inout2
   2017 
   2018 	call	_aesni_decrypt3
   2019 
   2020 	xorps	@tweak[0],$inout0
   2021 	movdqa	@tweak[3],@tweak[0]
   2022 	xorps	@tweak[1],$inout1
   2023 	movdqa	@tweak[5],@tweak[1]
   2024 	xorps	@tweak[2],$inout2
   2025 	movups	$inout0,($out)
   2026 	movups	$inout1,16*1($out)
   2027 	movups	$inout2,16*2($out)
   2028 	lea	16*3($out),$out
   2029 	jmp	.Lxts_dec_done
   2030 
   2031 .align	16
   2032 .Lxts_dec_four:
   2033 	pshufd	\$0x13,$twtmp,$twres
   2034 	movdqa	@tweak[5],@tweak[4]
   2035 	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
   2036 	 movups	($inp),$inout0
   2037 	pand	$twmask,$twres			# isolate carry and residue
   2038 	 movups	16*1($inp),$inout1
   2039 	pxor	$twres,@tweak[5]
   2040 
   2041 	movups	16*2($inp),$inout2
   2042 	xorps	@tweak[0],$inout0
   2043 	movups	16*3($inp),$inout3
   2044 	lea	16*4($inp),$inp
   2045 	xorps	@tweak[1],$inout1
   2046 	xorps	@tweak[2],$inout2
   2047 	xorps	@tweak[3],$inout3
   2048 
   2049 	call	_aesni_decrypt4
   2050 
   2051 	xorps	@tweak[0],$inout0
   2052 	movdqa	@tweak[4],@tweak[0]
   2053 	xorps	@tweak[1],$inout1
   2054 	movdqa	@tweak[5],@tweak[1]
   2055 	xorps	@tweak[2],$inout2
   2056 	movups	$inout0,($out)
   2057 	xorps	@tweak[3],$inout3
   2058 	movups	$inout1,16*1($out)
   2059 	movups	$inout2,16*2($out)
   2060 	movups	$inout3,16*3($out)
   2061 	lea	16*4($out),$out
   2062 	jmp	.Lxts_dec_done
   2063 
   2064 .align	16
   2065 .Lxts_dec_done:
   2066 	and	\$15,$len_
   2067 	jz	.Lxts_dec_ret
   2068 .Lxts_dec_done2:
   2069 	mov	$len_,$len
   2070 	mov	$key_,$key			# restore $key
   2071 	mov	$rnds_,$rounds			# restore $rounds
   2072 
   2073 	movups	($inp),$inout0
   2074 	xorps	@tweak[1],$inout0
   2075 ___
   2076 	&aesni_generate1("dec",$key,$rounds);
   2077 $code.=<<___;
   2078 	xorps	@tweak[1],$inout0
   2079 	movups	$inout0,($out)
   2080 
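	# Ciphertext stealing, decrypt side: the block just produced above was
	# decrypted with the later tweak; its leading bytes are swapped with
	# the trailing partial ciphertext, and the reassembled block is
	# decrypted once more below with the earlier tweak.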
   2081 .Lxts_dec_steal:
   2082 	movzb	16($inp),%eax			# borrow $rounds ...
   2083 	movzb	($out),%ecx			# ... and $key
   2084 	lea	1($inp),$inp
   2085 	mov	%al,($out)
   2086 	mov	%cl,16($out)
   2087 	lea	1($out),$out
   2088 	sub	\$1,$len
   2089 	jnz	.Lxts_dec_steal
   2090 
   2091 	sub	$len_,$out			# rewind $out
   2092 	mov	$key_,$key			# restore $key
   2093 	mov	$rnds_,$rounds			# restore $rounds
   2094 
   2095 	movups	($out),$inout0
   2096 	xorps	@tweak[0],$inout0
   2097 ___
   2098 	&aesni_generate1("dec",$key,$rounds);
   2099 $code.=<<___;
   2100 	xorps	@tweak[0],$inout0
   2101 	movups	$inout0,($out)
   2102 
   2103 .Lxts_dec_ret:
   2104 ___
   2105 $code.=<<___ if ($win64);
   2106 	movaps	0x60(%rsp),%xmm6
   2107 	movaps	0x70(%rsp),%xmm7
   2108 	movaps	0x80(%rsp),%xmm8
   2109 	movaps	0x90(%rsp),%xmm9
   2110 	movaps	0xa0(%rsp),%xmm10
   2111 	movaps	0xb0(%rsp),%xmm11
   2112 	movaps	0xc0(%rsp),%xmm12
   2113 	movaps	0xd0(%rsp),%xmm13
   2114 	movaps	0xe0(%rsp),%xmm14
   2115 	movaps	0xf0(%rsp),%xmm15
   2116 ___
   2117 $code.=<<___;
   2118 	lea	$frame_size(%rsp),%rsp
   2119 .Lxts_dec_epilogue:
   2120 	ret
   2121 .size	aesni_xts_decrypt,.-aesni_xts_decrypt
   2122 ___
   2123 } }}
   2124 
   2126 ########################################################################
   2127 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
   2128 #			    size_t length, const AES_KEY *key,
   2129 #			    unsigned char *ivp,const int enc);
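# A minimal caller sketch (assuming the default "aesni" symbol prefix and an
# OpenSSL-style AES_KEY; illustrative only):
#
#	AES_KEY ks;
#	unsigned char iv[16];	/* chaining value, updated in place */
#	aesni_set_encrypt_key(user_key, 128, &ks);
#	aesni_cbc_encrypt(in, out, len, &ks, iv, 1);	/* enc!=0: encrypt */
#	/* for decryption, schedule with aesni_set_decrypt_key() and pass enc=0 */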
   2130 {
   2131 my $reserved = $win64?0x40:-0x18;	# used in decrypt
   2132 $code.=<<___;
   2133 .globl	${PREFIX}_cbc_encrypt
   2134 .type	${PREFIX}_cbc_encrypt,\@function,6
   2135 .align	16
   2136 ${PREFIX}_cbc_encrypt:
   2137 	test	$len,$len		# check length
   2138 	jz	.Lcbc_ret
   2139 
   2140 	mov	240($key),$rnds_	# key->rounds
   2141 	mov	$key,$key_		# backup $key
   2142 	test	%r9d,%r9d		# 6th argument
   2143 	jz	.Lcbc_decrypt
   2144 #--------------------------- CBC ENCRYPT ------------------------------#
   2145 	movups	($ivp),$inout0		# load iv as initial state
   2146 	mov	$rnds_,$rounds
   2147 	cmp	\$16,$len
   2148 	jb	.Lcbc_enc_tail
   2149 	sub	\$16,$len
   2150 	jmp	.Lcbc_enc_loop
   2151 .align	16
   2152 .Lcbc_enc_loop:
   2153 	movups	($inp),$inout1		# load input
   2154 	lea	16($inp),$inp
   2155 	#xorps	$inout1,$inout0
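	# (that XOR with the previous ciphertext block, or with the IV on the
	#  first iteration, is handled inside aesni_generate1 via the extra
	#  argument passed below)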
   2156 ___
   2157 	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
   2158 $code.=<<___;
   2159 	mov	$rnds_,$rounds		# restore $rounds
   2160 	mov	$key_,$key		# restore $key
   2161 	movups	$inout0,0($out)		# store output
   2162 	lea	16($out),$out
   2163 	sub	\$16,$len
   2164 	jnc	.Lcbc_enc_loop
   2165 	add	\$16,$len
   2166 	jnz	.Lcbc_enc_tail
   2167 	movups	$inout0,($ivp)
   2168 	jmp	.Lcbc_ret
   2169 
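	# Partial final block: copy the remaining input bytes, zero-pad the
	# rest of the 16-byte block, then take one more spin through the loop
	# above to encrypt the padded block.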
   2170 .Lcbc_enc_tail:
   2171 	mov	$len,%rcx	# zaps $key
   2172 	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
   2173 	.long	0x9066A4F3	# rep movsb
   2174 	mov	\$16,%ecx	# zero tail
   2175 	sub	$len,%rcx
   2176 	xor	%eax,%eax
   2177 	.long	0x9066AAF3	# rep stosb
   2178 	lea	-16(%rdi),%rdi	# rewind $out by 1 block
   2179 	mov	$rnds_,$rounds	# restore $rounds
   2180 	mov	%rdi,%rsi	# $inp and $out are the same
   2181 	mov	$key_,$key	# restore $key
   2182 	xor	$len,$len	# len=16
   2183 	jmp	.Lcbc_enc_loop	# one more spin
   2184 #--------------------------- CBC DECRYPT ------------------------------#
   2186 .align	16
   2187 .Lcbc_decrypt:
   2188 ___
   2189 $code.=<<___ if ($win64);
   2190 	lea	-0x58(%rsp),%rsp
   2191 	movaps	%xmm6,(%rsp)
   2192 	movaps	%xmm7,0x10(%rsp)
   2193 	movaps	%xmm8,0x20(%rsp)
   2194 	movaps	%xmm9,0x30(%rsp)
   2195 .Lcbc_decrypt_body:
   2196 ___
   2197 $code.=<<___;
   2198 	movups	($ivp),$iv
   2199 	mov	$rnds_,$rounds
   2200 	cmp	\$0x70,$len
   2201 	jbe	.Lcbc_dec_tail
   2202 	shr	\$1,$rnds_
   2203 	sub	\$0x70,$len
   2204 	mov	$rnds_,$rounds
   2205 	movaps	$iv,$reserved(%rsp)
   2206 	jmp	.Lcbc_dec_loop8_enter
   2207 .align	16
   2208 .Lcbc_dec_loop8:
   2209 	movaps	$rndkey0,$reserved(%rsp)	# save IV
   2210 	movups	$inout7,($out)
   2211 	lea	0x10($out),$out
   2212 .Lcbc_dec_loop8_enter:
   2213 	$movkey		($key),$rndkey0
   2214 	movups	($inp),$inout0			# load input
   2215 	movups	0x10($inp),$inout1
   2216 	$movkey		16($key),$rndkey1
   2217 
   2218 	lea		32($key),$key
   2219 	movdqu	0x20($inp),$inout2
   2220 	xorps		$rndkey0,$inout0
   2221 	movdqu	0x30($inp),$inout3
   2222 	xorps		$rndkey0,$inout1
   2223 	movdqu	0x40($inp),$inout4
   2224 	aesdec		$rndkey1,$inout0
   2225 	pxor		$rndkey0,$inout2
   2226 	movdqu	0x50($inp),$inout5
   2227 	aesdec		$rndkey1,$inout1
   2228 	pxor		$rndkey0,$inout3
   2229 	movdqu	0x60($inp),$inout6
   2230 	aesdec		$rndkey1,$inout2
   2231 	pxor		$rndkey0,$inout4
   2232 	movdqu	0x70($inp),$inout7
   2233 	aesdec		$rndkey1,$inout3
   2234 	pxor		$rndkey0,$inout5
   2235 	dec		$rounds
   2236 	aesdec		$rndkey1,$inout4
   2237 	pxor		$rndkey0,$inout6
   2238 	aesdec		$rndkey1,$inout5
   2239 	pxor		$rndkey0,$inout7
   2240 	$movkey		($key),$rndkey0
   2241 	aesdec		$rndkey1,$inout6
   2242 	aesdec		$rndkey1,$inout7
   2243 	$movkey		16($key),$rndkey1
   2244 
   2245 	call		.Ldec_loop8_enter
   2246 
   2247 	movups	($inp),$rndkey1		# re-load input
   2248 	movups	0x10($inp),$rndkey0
   2249 	xorps	$reserved(%rsp),$inout0	# ^= IV
   2250 	xorps	$rndkey1,$inout1
   2251 	movups	0x20($inp),$rndkey1
   2252 	xorps	$rndkey0,$inout2
   2253 	movups	0x30($inp),$rndkey0
   2254 	xorps	$rndkey1,$inout3
   2255 	movups	0x40($inp),$rndkey1
   2256 	xorps	$rndkey0,$inout4
   2257 	movups	0x50($inp),$rndkey0
   2258 	xorps	$rndkey1,$inout5
   2259 	movups	0x60($inp),$rndkey1
   2260 	xorps	$rndkey0,$inout6
   2261 	movups	0x70($inp),$rndkey0	# IV
   2262 	xorps	$rndkey1,$inout7
   2263 	movups	$inout0,($out)
   2264 	movups	$inout1,0x10($out)
   2265 	movups	$inout2,0x20($out)
   2266 	movups	$inout3,0x30($out)
   2267 	mov	$rnds_,$rounds		# restore $rounds
   2268 	movups	$inout4,0x40($out)
   2269 	mov	$key_,$key		# restore $key
   2270 	movups	$inout5,0x50($out)
   2271 	lea	0x80($inp),$inp
   2272 	movups	$inout6,0x60($out)
   2273 	lea	0x70($out),$out
   2274 	sub	\$0x80,$len
   2275 	ja	.Lcbc_dec_loop8
   2276 
   2277 	movaps	$inout7,$inout0
   2278 	movaps	$rndkey0,$iv
   2279 	add	\$0x70,$len
   2280 	jle	.Lcbc_dec_tail_collected
   2281 	movups	$inout0,($out)
   2282 	lea	1($rnds_,$rnds_),$rounds
   2283 	lea	0x10($out),$out
   2284 .Lcbc_dec_tail:
   2285 	movups	($inp),$inout0
   2286 	movaps	$inout0,$in0
   2287 	cmp	\$0x10,$len
   2288 	jbe	.Lcbc_dec_one
   2289 
   2290 	movups	0x10($inp),$inout1
   2291 	movaps	$inout1,$in1
   2292 	cmp	\$0x20,$len
   2293 	jbe	.Lcbc_dec_two
   2294 
   2295 	movups	0x20($inp),$inout2
   2296 	movaps	$inout2,$in2
   2297 	cmp	\$0x30,$len
   2298 	jbe	.Lcbc_dec_three
   2299 
   2300 	movups	0x30($inp),$inout3
   2301 	cmp	\$0x40,$len
   2302 	jbe	.Lcbc_dec_four
   2303 
   2304 	movups	0x40($inp),$inout4
   2305 	cmp	\$0x50,$len
   2306 	jbe	.Lcbc_dec_five
   2307 
   2308 	movups	0x50($inp),$inout5
   2309 	cmp	\$0x60,$len
   2310 	jbe	.Lcbc_dec_six
   2311 
   2312 	movups	0x60($inp),$inout6
   2313 	movaps	$iv,$reserved(%rsp)	# save IV
   2314 	call	_aesni_decrypt8
   2315 	movups	($inp),$rndkey1
   2316 	movups	0x10($inp),$rndkey0
   2317 	xorps	$reserved(%rsp),$inout0	# ^= IV
   2318 	xorps	$rndkey1,$inout1
   2319 	movups	0x20($inp),$rndkey1
   2320 	xorps	$rndkey0,$inout2
   2321 	movups	0x30($inp),$rndkey0
   2322 	xorps	$rndkey1,$inout3
   2323 	movups	0x40($inp),$rndkey1
   2324 	xorps	$rndkey0,$inout4
   2325 	movups	0x50($inp),$rndkey0
   2326 	xorps	$rndkey1,$inout5
   2327 	movups	0x60($inp),$iv		# IV
   2328 	xorps	$rndkey0,$inout6
   2329 	movups	$inout0,($out)
   2330 	movups	$inout1,0x10($out)
   2331 	movups	$inout2,0x20($out)
   2332 	movups	$inout3,0x30($out)
   2333 	movups	$inout4,0x40($out)
   2334 	movups	$inout5,0x50($out)
   2335 	lea	0x60($out),$out
   2336 	movaps	$inout6,$inout0
   2337 	sub	\$0x70,$len
   2338 	jmp	.Lcbc_dec_tail_collected
   2339 .align	16
   2340 .Lcbc_dec_one:
   2341 ___
   2342 	&aesni_generate1("dec",$key,$rounds);
   2343 $code.=<<___;
   2344 	xorps	$iv,$inout0
   2345 	movaps	$in0,$iv
   2346 	sub	\$0x10,$len
   2347 	jmp	.Lcbc_dec_tail_collected
   2348 .align	16
   2349 .Lcbc_dec_two:
   2350 	xorps	$inout2,$inout2
   2351 	call	_aesni_decrypt3
   2352 	xorps	$iv,$inout0
   2353 	xorps	$in0,$inout1
   2354 	movups	$inout0,($out)
   2355 	movaps	$in1,$iv
   2356 	movaps	$inout1,$inout0
   2357 	lea	0x10($out),$out
   2358 	sub	\$0x20,$len
   2359 	jmp	.Lcbc_dec_tail_collected
   2360 .align	16
   2361 .Lcbc_dec_three:
   2362 	call	_aesni_decrypt3
   2363 	xorps	$iv,$inout0
   2364 	xorps	$in0,$inout1
   2365 	movups	$inout0,($out)
   2366 	xorps	$in1,$inout2
   2367 	movups	$inout1,0x10($out)
   2368 	movaps	$in2,$iv
   2369 	movaps	$inout2,$inout0
   2370 	lea	0x20($out),$out
   2371 	sub	\$0x30,$len
   2372 	jmp	.Lcbc_dec_tail_collected
   2373 .align	16
   2374 .Lcbc_dec_four:
   2375 	call	_aesni_decrypt4
   2376 	xorps	$iv,$inout0
   2377 	movups	0x30($inp),$iv
   2378 	xorps	$in0,$inout1
   2379 	movups	$inout0,($out)
   2380 	xorps	$in1,$inout2
   2381 	movups	$inout1,0x10($out)
   2382 	xorps	$in2,$inout3
   2383 	movups	$inout2,0x20($out)
   2384 	movaps	$inout3,$inout0
   2385 	lea	0x30($out),$out
   2386 	sub	\$0x40,$len
   2387 	jmp	.Lcbc_dec_tail_collected
   2388 .align	16
   2389 .Lcbc_dec_five:
   2390 	xorps	$inout5,$inout5
   2391 	call	_aesni_decrypt6
   2392 	movups	0x10($inp),$rndkey1
   2393 	movups	0x20($inp),$rndkey0
   2394 	xorps	$iv,$inout0
   2395 	xorps	$in0,$inout1
   2396 	xorps	$rndkey1,$inout2
   2397 	movups	0x30($inp),$rndkey1
   2398 	xorps	$rndkey0,$inout3
   2399 	movups	0x40($inp),$iv
   2400 	xorps	$rndkey1,$inout4
   2401 	movups	$inout0,($out)
   2402 	movups	$inout1,0x10($out)
   2403 	movups	$inout2,0x20($out)
   2404 	movups	$inout3,0x30($out)
   2405 	lea	0x40($out),$out
   2406 	movaps	$inout4,$inout0
   2407 	sub	\$0x50,$len
   2408 	jmp	.Lcbc_dec_tail_collected
   2409 .align	16
   2410 .Lcbc_dec_six:
   2411 	call	_aesni_decrypt6
   2412 	movups	0x10($inp),$rndkey1
   2413 	movups	0x20($inp),$rndkey0
   2414 	xorps	$iv,$inout0
   2415 	xorps	$in0,$inout1
   2416 	xorps	$rndkey1,$inout2
   2417 	movups	0x30($inp),$rndkey1
   2418 	xorps	$rndkey0,$inout3
   2419 	movups	0x40($inp),$rndkey0
   2420 	xorps	$rndkey1,$inout4
   2421 	movups	0x50($inp),$iv
   2422 	xorps	$rndkey0,$inout5
   2423 	movups	$inout0,($out)
   2424 	movups	$inout1,0x10($out)
   2425 	movups	$inout2,0x20($out)
   2426 	movups	$inout3,0x30($out)
   2427 	movups	$inout4,0x40($out)
   2428 	lea	0x50($out),$out
   2429 	movaps	$inout5,$inout0
   2430 	sub	\$0x60,$len
   2431 	jmp	.Lcbc_dec_tail_collected
   2432 .align	16
   2433 .Lcbc_dec_tail_collected:
   2434 	and	\$15,$len
   2435 	movups	$iv,($ivp)
   2436 	jnz	.Lcbc_dec_tail_partial
   2437 	movups	$inout0,($out)
   2438 	jmp	.Lcbc_dec_ret
   2439 .align	16
   2440 .Lcbc_dec_tail_partial:
   2441 	movaps	$inout0,$reserved(%rsp)
   2442 	mov	\$16,%rcx
   2443 	mov	$out,%rdi
   2444 	sub	$len,%rcx
   2445 	lea	$reserved(%rsp),%rsi
   2446 	.long	0x9066A4F3	# rep movsb
   2447 
   2448 .Lcbc_dec_ret:
   2449 ___
   2450 $code.=<<___ if ($win64);
   2451 	movaps	(%rsp),%xmm6
   2452 	movaps	0x10(%rsp),%xmm7
   2453 	movaps	0x20(%rsp),%xmm8
   2454 	movaps	0x30(%rsp),%xmm9
   2455 	lea	0x58(%rsp),%rsp
   2456 ___
   2457 $code.=<<___;
   2458 .Lcbc_ret:
   2459 	ret
   2460 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
   2461 ___
   2462 } 
   2464 # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
   2465 #				int bits, AES_KEY *key)
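# Returns 0 on success, -1 if userKey or key is NULL, and -2 if bits is not
# one of 128, 192 or 256 (.Lbad_keybits).  Typical use, assuming the default
# "aesni" symbol prefix (illustrative only):
#
#	AES_KEY ks;
#	if (aesni_set_encrypt_key(user_key, 256, &ks) != 0)
#		/* handle invalid arguments */ ;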
   2466 { my ($inp,$bits,$key) = @_4args;
   2467   $bits =~ s/%r/%e/;
   2468 
   2469 $code.=<<___;
   2470 .globl	${PREFIX}_set_decrypt_key
   2471 .type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
   2472 .align	16
   2473 ${PREFIX}_set_decrypt_key:
   2474 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
   2475 	call	__aesni_set_encrypt_key
    2476 	shl	\$4,$bits		# rounds-1 after __aesni_set_encrypt_key
   2477 	test	%eax,%eax
   2478 	jnz	.Ldec_key_ret
   2479 	lea	16($key,$bits),$inp	# points at the end of key schedule
   2480 
   2481 	$movkey	($key),%xmm0		# just swap
   2482 	$movkey	($inp),%xmm1
   2483 	$movkey	%xmm0,($inp)
   2484 	$movkey	%xmm1,($key)
   2485 	lea	16($key),$key
   2486 	lea	-16($inp),$inp
   2487 
   2488 .Ldec_key_inverse:
   2489 	$movkey	($key),%xmm0		# swap and inverse
   2490 	$movkey	($inp),%xmm1
   2491 	aesimc	%xmm0,%xmm0
   2492 	aesimc	%xmm1,%xmm1
   2493 	lea	16($key),$key
   2494 	lea	-16($inp),$inp
   2495 	$movkey	%xmm0,16($inp)
   2496 	$movkey	%xmm1,-16($key)
   2497 	cmp	$key,$inp
   2498 	ja	.Ldec_key_inverse
   2499 
   2500 	$movkey	($key),%xmm0		# inverse middle
   2501 	aesimc	%xmm0,%xmm0
   2502 	$movkey	%xmm0,($inp)
   2503 .Ldec_key_ret:
   2504 	add	\$8,%rsp
   2505 	ret
   2506 .LSEH_end_set_decrypt_key:
   2507 .size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
   2508 ___
   2509 
    2511 # This is based on a submission by
   2512 #
    2513 #	Huang Ying <ying.huang@intel.com>
    2514 #	Vinodh Gopal <vinodh.gopal@intel.com>
   2515 #	Kahraman Akdemir
   2516 #
    2517 # Aggressively optimized with respect to aeskeygenassist's critical path;
    2518 # register usage is confined to %xmm0-5 to meet the Win64 ABI requirement.
   2519 #
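# The expansion helpers below implement the standard AES key schedule:
# aeskeygenassist supplies SubWord(RotWord(.))^Rcon in the lane that the
# following shufps/pshufd broadcast selects, and the shufps/xorps pairs fold
# in the running XOR of the previous round key's words without leaving the
# SSE register file.  Conceptually, for the 128-bit schedule (a rough sketch,
# not lane-exact):
#
#	t = SubWord(RotWord(w[4*i+3])) ^ Rcon[i];
#	w[4*i+4] = w[4*i+0] ^ t;
#	w[4*i+5] = w[4*i+1] ^ w[4*i+4];
#	w[4*i+6] = w[4*i+2] ^ w[4*i+5];
#	w[4*i+7] = w[4*i+3] ^ w[4*i+6];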
   2520 $code.=<<___;
   2521 .globl	${PREFIX}_set_encrypt_key
   2522 .type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
   2523 .align	16
   2524 ${PREFIX}_set_encrypt_key:
   2525 __aesni_set_encrypt_key:
   2526 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
   2527 	mov	\$-1,%rax
   2528 	test	$inp,$inp
   2529 	jz	.Lenc_key_ret
   2530 	test	$key,$key
   2531 	jz	.Lenc_key_ret
   2532 
   2533 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
   2534 	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
   2535 	lea	16($key),%rax
   2536 	cmp	\$256,$bits
   2537 	je	.L14rounds
   2538 	cmp	\$192,$bits
   2539 	je	.L12rounds
   2540 	cmp	\$128,$bits
   2541 	jne	.Lbad_keybits
   2542 
   2543 .L10rounds:
   2544 	mov	\$9,$bits			# 10 rounds for 128-bit key
   2545 	$movkey	%xmm0,($key)			# round 0
   2546 	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
   2547 	call		.Lkey_expansion_128_cold
   2548 	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
   2549 	call		.Lkey_expansion_128
   2550 	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
   2551 	call		.Lkey_expansion_128
   2552 	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
   2553 	call		.Lkey_expansion_128
   2554 	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
   2555 	call		.Lkey_expansion_128
   2556 	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
   2557 	call		.Lkey_expansion_128
   2558 	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
   2559 	call		.Lkey_expansion_128
   2560 	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
   2561 	call		.Lkey_expansion_128
   2562 	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
   2563 	call		.Lkey_expansion_128
   2564 	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
   2565 	call		.Lkey_expansion_128
   2566 	$movkey	%xmm0,(%rax)
   2567 	mov	$bits,80(%rax)	# 240(%rdx)
   2568 	xor	%eax,%eax
   2569 	jmp	.Lenc_key_ret
   2570 
   2571 .align	16
   2572 .L12rounds:
   2573 	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
   2574 	mov	\$11,$bits			# 12 rounds for 192
   2575 	$movkey	%xmm0,($key)			# round 0
   2576 	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
   2577 	call		.Lkey_expansion_192a_cold
   2578 	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
   2579 	call		.Lkey_expansion_192b
   2580 	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
   2581 	call		.Lkey_expansion_192a
   2582 	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
   2583 	call		.Lkey_expansion_192b
   2584 	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
   2585 	call		.Lkey_expansion_192a
   2586 	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
   2587 	call		.Lkey_expansion_192b
   2588 	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
   2589 	call		.Lkey_expansion_192a
   2590 	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
   2591 	call		.Lkey_expansion_192b
   2592 	$movkey	%xmm0,(%rax)
   2593 	mov	$bits,48(%rax)	# 240(%rdx)
    2594 	xor	%rax,%rax
   2595 	jmp	.Lenc_key_ret
   2596 
   2597 .align	16
   2598 .L14rounds:
    2599 	movups	16($inp),%xmm2			# remaining half of *userKey
   2600 	mov	\$13,$bits			# 14 rounds for 256
   2601 	lea	16(%rax),%rax
   2602 	$movkey	%xmm0,($key)			# round 0
   2603 	$movkey	%xmm2,16($key)			# round 1
   2604 	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
   2605 	call		.Lkey_expansion_256a_cold
   2606 	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
   2607 	call		.Lkey_expansion_256b
   2608 	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
   2609 	call		.Lkey_expansion_256a
   2610 	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
   2611 	call		.Lkey_expansion_256b
   2612 	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
   2613 	call		.Lkey_expansion_256a
   2614 	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
   2615 	call		.Lkey_expansion_256b
   2616 	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
   2617 	call		.Lkey_expansion_256a
   2618 	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
   2619 	call		.Lkey_expansion_256b
   2620 	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
   2621 	call		.Lkey_expansion_256a
   2622 	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
   2623 	call		.Lkey_expansion_256b
   2624 	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
   2625 	call		.Lkey_expansion_256a
   2626 	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
   2627 	call		.Lkey_expansion_256b
   2628 	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
   2629 	call		.Lkey_expansion_256a
   2630 	$movkey	%xmm0,(%rax)
   2631 	mov	$bits,16(%rax)	# 240(%rdx)
   2632 	xor	%rax,%rax
   2633 	jmp	.Lenc_key_ret
   2634 
   2635 .align	16
   2636 .Lbad_keybits:
   2637 	mov	\$-2,%rax
   2638 .Lenc_key_ret:
   2639 	add	\$8,%rsp
   2640 	ret
   2641 .LSEH_end_set_encrypt_key:
   2642 
   2644 .align	16
   2645 .Lkey_expansion_128:
   2646 	$movkey	%xmm0,(%rax)
   2647 	lea	16(%rax),%rax
   2648 .Lkey_expansion_128_cold:
   2649 	shufps	\$0b00010000,%xmm0,%xmm4
   2650 	xorps	%xmm4, %xmm0
   2651 	shufps	\$0b10001100,%xmm0,%xmm4
   2652 	xorps	%xmm4, %xmm0
   2653 	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
   2654 	xorps	%xmm1,%xmm0
   2655 	ret
   2656 
   2657 .align 16
   2658 .Lkey_expansion_192a:
   2659 	$movkey	%xmm0,(%rax)
   2660 	lea	16(%rax),%rax
   2661 .Lkey_expansion_192a_cold:
   2662 	movaps	%xmm2, %xmm5
   2663 .Lkey_expansion_192b_warm:
   2664 	shufps	\$0b00010000,%xmm0,%xmm4
   2665 	movdqa	%xmm2,%xmm3
   2666 	xorps	%xmm4,%xmm0
   2667 	shufps	\$0b10001100,%xmm0,%xmm4
   2668 	pslldq	\$4,%xmm3
   2669 	xorps	%xmm4,%xmm0
   2670 	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
   2671 	pxor	%xmm3,%xmm2
   2672 	pxor	%xmm1,%xmm0
   2673 	pshufd	\$0b11111111,%xmm0,%xmm3
   2674 	pxor	%xmm3,%xmm2
   2675 	ret
   2676 
   2677 .align 16
   2678 .Lkey_expansion_192b:
   2679 	movaps	%xmm0,%xmm3
   2680 	shufps	\$0b01000100,%xmm0,%xmm5
   2681 	$movkey	%xmm5,(%rax)
   2682 	shufps	\$0b01001110,%xmm2,%xmm3
   2683 	$movkey	%xmm3,16(%rax)
   2684 	lea	32(%rax),%rax
   2685 	jmp	.Lkey_expansion_192b_warm
   2686 
   2687 .align	16
   2688 .Lkey_expansion_256a:
   2689 	$movkey	%xmm2,(%rax)
   2690 	lea	16(%rax),%rax
   2691 .Lkey_expansion_256a_cold:
   2692 	shufps	\$0b00010000,%xmm0,%xmm4
   2693 	xorps	%xmm4,%xmm0
   2694 	shufps	\$0b10001100,%xmm0,%xmm4
   2695 	xorps	%xmm4,%xmm0
   2696 	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
   2697 	xorps	%xmm1,%xmm0
   2698 	ret
   2699 
   2700 .align 16
   2701 .Lkey_expansion_256b:
   2702 	$movkey	%xmm0,(%rax)
   2703 	lea	16(%rax),%rax
   2704 
   2705 	shufps	\$0b00010000,%xmm2,%xmm4
   2706 	xorps	%xmm4,%xmm2
   2707 	shufps	\$0b10001100,%xmm2,%xmm4
   2708 	xorps	%xmm4,%xmm2
   2709 	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
   2710 	xorps	%xmm1,%xmm2
   2711 	ret
   2712 .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
   2713 .size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
   2714 ___
   2715 }
   2716 
   2718 $code.=<<___;
   2719 .align	64
   2720 .Lbswap_mask:
   2721 	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
   2722 .Lincrement32:
   2723 	.long	6,6,6,0
   2724 .Lincrement64:
   2725 	.long	1,0,0,0
   2726 .Lxts_magic:
   2727 	.long	0x87,0,1,0
   2728 
   2729 .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
   2730 .align	64
   2731 ___
   2732 
   2733 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2734 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
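# These handlers make the hand-rolled frames above unwindable on Win64: when
# an exception hits between the recorded prologue and epilogue labels, they
# restore any non-volatile XMM registers spilled in the prologue, fix up
# Rsp/Rsi/Rdi in the CONTEXT record and hand off to RtlVirtualUnwind.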
   2735 if ($win64) {
   2736 $rec="%rcx";
   2737 $frame="%rdx";
   2738 $context="%r8";
   2739 $disp="%r9";
   2740 
   2741 $code.=<<___;
   2742 .extern	__imp_RtlVirtualUnwind
   2743 ___
   2744 $code.=<<___ if ($PREFIX eq "aesni");
   2745 .type	ecb_se_handler,\@abi-omnipotent
   2746 .align	16
   2747 ecb_se_handler:
   2748 	push	%rsi
   2749 	push	%rdi
   2750 	push	%rbx
   2751 	push	%rbp
   2752 	push	%r12
   2753 	push	%r13
   2754 	push	%r14
   2755 	push	%r15
   2756 	pushfq
   2757 	sub	\$64,%rsp
   2758 
   2759 	mov	152($context),%rax	# pull context->Rsp
   2760 
   2761 	jmp	.Lcommon_seh_tail
   2762 .size	ecb_se_handler,.-ecb_se_handler
   2763 
   2764 .type	ccm64_se_handler,\@abi-omnipotent
   2765 .align	16
   2766 ccm64_se_handler:
   2767 	push	%rsi
   2768 	push	%rdi
   2769 	push	%rbx
   2770 	push	%rbp
   2771 	push	%r12
   2772 	push	%r13
   2773 	push	%r14
   2774 	push	%r15
   2775 	pushfq
   2776 	sub	\$64,%rsp
   2777 
   2778 	mov	120($context),%rax	# pull context->Rax
   2779 	mov	248($context),%rbx	# pull context->Rip
   2780 
   2781 	mov	8($disp),%rsi		# disp->ImageBase
   2782 	mov	56($disp),%r11		# disp->HandlerData
   2783 
   2784 	mov	0(%r11),%r10d		# HandlerData[0]
   2785 	lea	(%rsi,%r10),%r10	# prologue label
   2786 	cmp	%r10,%rbx		# context->Rip<prologue label
   2787 	jb	.Lcommon_seh_tail
   2788 
   2789 	mov	152($context),%rax	# pull context->Rsp
   2790 
   2791 	mov	4(%r11),%r10d		# HandlerData[1]
   2792 	lea	(%rsi,%r10),%r10	# epilogue label
   2793 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2794 	jae	.Lcommon_seh_tail
   2795 
   2796 	lea	0(%rax),%rsi		# %xmm save area
   2797 	lea	512($context),%rdi	# &context.Xmm6
   2798 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
   2799 	.long	0xa548f3fc		# cld; rep movsq
   2800 	lea	0x58(%rax),%rax		# adjust stack pointer
   2801 
   2802 	jmp	.Lcommon_seh_tail
   2803 .size	ccm64_se_handler,.-ccm64_se_handler
   2804 
   2805 .type	ctr32_se_handler,\@abi-omnipotent
   2806 .align	16
   2807 ctr32_se_handler:
   2808 	push	%rsi
   2809 	push	%rdi
   2810 	push	%rbx
   2811 	push	%rbp
   2812 	push	%r12
   2813 	push	%r13
   2814 	push	%r14
   2815 	push	%r15
   2816 	pushfq
   2817 	sub	\$64,%rsp
   2818 
   2819 	mov	120($context),%rax	# pull context->Rax
   2820 	mov	248($context),%rbx	# pull context->Rip
   2821 
   2822 	lea	.Lctr32_body(%rip),%r10
   2823 	cmp	%r10,%rbx		# context->Rip<"prologue" label
   2824 	jb	.Lcommon_seh_tail
   2825 
   2826 	mov	152($context),%rax	# pull context->Rsp
   2827 
   2828 	lea	.Lctr32_ret(%rip),%r10
   2829 	cmp	%r10,%rbx
   2830 	jae	.Lcommon_seh_tail
   2831 
   2832 	lea	0x20(%rax),%rsi		# %xmm save area
   2833 	lea	512($context),%rdi	# &context.Xmm6
   2834 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
   2835 	.long	0xa548f3fc		# cld; rep movsq
   2836 	lea	0xc8(%rax),%rax		# adjust stack pointer
   2837 
   2838 	jmp	.Lcommon_seh_tail
   2839 .size	ctr32_se_handler,.-ctr32_se_handler
   2840 
   2841 .type	xts_se_handler,\@abi-omnipotent
   2842 .align	16
   2843 xts_se_handler:
   2844 	push	%rsi
   2845 	push	%rdi
   2846 	push	%rbx
   2847 	push	%rbp
   2848 	push	%r12
   2849 	push	%r13
   2850 	push	%r14
   2851 	push	%r15
   2852 	pushfq
   2853 	sub	\$64,%rsp
   2854 
   2855 	mov	120($context),%rax	# pull context->Rax
   2856 	mov	248($context),%rbx	# pull context->Rip
   2857 
   2858 	mov	8($disp),%rsi		# disp->ImageBase
   2859 	mov	56($disp),%r11		# disp->HandlerData
   2860 
   2861 	mov	0(%r11),%r10d		# HandlerData[0]
    2862 	lea	(%rsi,%r10),%r10	# prologue label
   2863 	cmp	%r10,%rbx		# context->Rip<prologue label
   2864 	jb	.Lcommon_seh_tail
   2865 
   2866 	mov	152($context),%rax	# pull context->Rsp
   2867 
   2868 	mov	4(%r11),%r10d		# HandlerData[1]
   2869 	lea	(%rsi,%r10),%r10	# epilogue label
   2870 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2871 	jae	.Lcommon_seh_tail
   2872 
   2873 	lea	0x60(%rax),%rsi		# %xmm save area
    2874 	lea	512($context),%rdi	# &context.Xmm6
   2875 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
   2876 	.long	0xa548f3fc		# cld; rep movsq
   2877 	lea	0x68+160(%rax),%rax	# adjust stack pointer
   2878 
   2879 	jmp	.Lcommon_seh_tail
   2880 .size	xts_se_handler,.-xts_se_handler
   2881 ___
   2882 $code.=<<___;
   2883 .type	cbc_se_handler,\@abi-omnipotent
   2884 .align	16
   2885 cbc_se_handler:
   2886 	push	%rsi
   2887 	push	%rdi
   2888 	push	%rbx
   2889 	push	%rbp
   2890 	push	%r12
   2891 	push	%r13
   2892 	push	%r14
   2893 	push	%r15
   2894 	pushfq
   2895 	sub	\$64,%rsp
   2896 
   2897 	mov	152($context),%rax	# pull context->Rsp
   2898 	mov	248($context),%rbx	# pull context->Rip
   2899 
   2900 	lea	.Lcbc_decrypt(%rip),%r10
   2901 	cmp	%r10,%rbx		# context->Rip<"prologue" label
   2902 	jb	.Lcommon_seh_tail
   2903 
   2904 	lea	.Lcbc_decrypt_body(%rip),%r10
   2905 	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
   2906 	jb	.Lrestore_cbc_rax
   2907 
   2908 	lea	.Lcbc_ret(%rip),%r10
   2909 	cmp	%r10,%rbx		# context->Rip>="epilogue" label
   2910 	jae	.Lcommon_seh_tail
   2911 
   2912 	lea	0(%rax),%rsi		# top of stack
   2913 	lea	512($context),%rdi	# &context.Xmm6
   2914 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
   2915 	.long	0xa548f3fc		# cld; rep movsq
   2916 	lea	0x58(%rax),%rax		# adjust stack pointer
   2917 	jmp	.Lcommon_seh_tail
   2918 
   2919 .Lrestore_cbc_rax:
   2920 	mov	120($context),%rax
   2921 
   2922 .Lcommon_seh_tail:
   2923 	mov	8(%rax),%rdi
   2924 	mov	16(%rax),%rsi
   2925 	mov	%rax,152($context)	# restore context->Rsp
   2926 	mov	%rsi,168($context)	# restore context->Rsi
   2927 	mov	%rdi,176($context)	# restore context->Rdi
   2928 
   2929 	mov	40($disp),%rdi		# disp->ContextRecord
   2930 	mov	$context,%rsi		# context
    2931 	mov	\$154,%ecx		# sizeof(CONTEXT)/sizeof(%rax)
   2932 	.long	0xa548f3fc		# cld; rep movsq
   2933 
   2934 	mov	$disp,%rsi
   2935 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2936 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2937 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2938 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2939 	mov	40(%rsi),%r10		# disp->ContextRecord
   2940 	lea	56(%rsi),%r11		# &disp->HandlerData
   2941 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2942 	mov	%r10,32(%rsp)		# arg5
   2943 	mov	%r11,40(%rsp)		# arg6
   2944 	mov	%r12,48(%rsp)		# arg7
   2945 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2946 	call	*__imp_RtlVirtualUnwind(%rip)
   2947 
   2948 	mov	\$1,%eax		# ExceptionContinueSearch
   2949 	add	\$64,%rsp
   2950 	popfq
   2951 	pop	%r15
   2952 	pop	%r14
   2953 	pop	%r13
   2954 	pop	%r12
   2955 	pop	%rbp
   2956 	pop	%rbx
   2957 	pop	%rdi
   2958 	pop	%rsi
   2959 	ret
   2960 .size	cbc_se_handler,.-cbc_se_handler
   2961 
   2962 .section	.pdata
   2963 .align	4
   2964 ___
   2965 $code.=<<___ if ($PREFIX eq "aesni");
   2966 	.rva	.LSEH_begin_aesni_ecb_encrypt
   2967 	.rva	.LSEH_end_aesni_ecb_encrypt
   2968 	.rva	.LSEH_info_ecb
   2969 
   2970 	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
   2971 	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
   2972 	.rva	.LSEH_info_ccm64_enc
   2973 
   2974 	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
   2975 	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
   2976 	.rva	.LSEH_info_ccm64_dec
   2977 
   2978 	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
   2979 	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
   2980 	.rva	.LSEH_info_ctr32
   2981 
   2982 	.rva	.LSEH_begin_aesni_xts_encrypt
   2983 	.rva	.LSEH_end_aesni_xts_encrypt
   2984 	.rva	.LSEH_info_xts_enc
   2985 
   2986 	.rva	.LSEH_begin_aesni_xts_decrypt
   2987 	.rva	.LSEH_end_aesni_xts_decrypt
   2988 	.rva	.LSEH_info_xts_dec
   2989 ___
   2990 $code.=<<___;
   2991 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
   2992 	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
   2993 	.rva	.LSEH_info_cbc
   2994 
   2995 	.rva	${PREFIX}_set_decrypt_key
   2996 	.rva	.LSEH_end_set_decrypt_key
   2997 	.rva	.LSEH_info_key
   2998 
   2999 	.rva	${PREFIX}_set_encrypt_key
   3000 	.rva	.LSEH_end_set_encrypt_key
   3001 	.rva	.LSEH_info_key
   3002 .section	.xdata
   3003 .align	8
   3004 ___
   3005 $code.=<<___ if ($PREFIX eq "aesni");
   3006 .LSEH_info_ecb:
   3007 	.byte	9,0,0,0
   3008 	.rva	ecb_se_handler
   3009 .LSEH_info_ccm64_enc:
   3010 	.byte	9,0,0,0
   3011 	.rva	ccm64_se_handler
   3012 	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
   3013 .LSEH_info_ccm64_dec:
   3014 	.byte	9,0,0,0
   3015 	.rva	ccm64_se_handler
   3016 	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
   3017 .LSEH_info_ctr32:
   3018 	.byte	9,0,0,0
   3019 	.rva	ctr32_se_handler
   3020 .LSEH_info_xts_enc:
   3021 	.byte	9,0,0,0
   3022 	.rva	xts_se_handler
   3023 	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
   3024 .LSEH_info_xts_dec:
   3025 	.byte	9,0,0,0
   3026 	.rva	xts_se_handler
   3027 	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
   3028 ___
   3029 $code.=<<___;
   3030 .LSEH_info_cbc:
   3031 	.byte	9,0,0,0
   3032 	.rva	cbc_se_handler
   3033 .LSEH_info_key:
   3034 	.byte	0x01,0x04,0x01,0x00
   3035 	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
   3036 ___
   3037 }
   3038 
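# The two helpers below let the module assemble with toolchains that predate
# AES-NI: aesni() rewrites the mnemonics used above into raw .byte sequences
# (see the substitution at the bottom of the file), and rex() prepends the
# REX prefix byte whenever one of the high XMM registers (xmm8-xmm15) is
# involved.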
   3039 sub rex {
   3040   local *opcode=shift;
   3041   my ($dst,$src)=@_;
   3042   my $rex=0;
   3043 
   3044     $rex|=0x04			if($dst>=8);
   3045     $rex|=0x01			if($src>=8);
   3046     push @opcode,$rex|0x40	if($rex);
   3047 }
   3048 
   3049 sub aesni {
   3050   my $line=shift;
   3051   my @opcode=(0x66);
   3052 
   3053     if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   3054 	rex(\@opcode,$4,$3);
   3055 	push @opcode,0x0f,0x3a,0xdf;
   3056 	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
   3057 	my $c=$2;
   3058 	push @opcode,$c=~/^0/?oct($c):$c;
   3059 	return ".byte\t".join(',',@opcode);
   3060     }
   3061     elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   3062 	my %opcodelet = (
   3063 		"aesimc" => 0xdb,
   3064 		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
   3065 		"aesdec" => 0xde,	"aesdeclast" => 0xdf
   3066 	);
   3067 	return undef if (!defined($opcodelet{$1}));
   3068 	rex(\@opcode,$3,$2);
   3069 	push @opcode,0x0f,0x38,$opcodelet{$1};
   3070 	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
   3071 	return ".byte\t".join(',',@opcode);
   3072     }
   3073     return $line;
   3074 }
   3075 
   3076 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   3077 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
   3078 
   3079 print $code;
   3080 
   3081 close STDOUT;
   3082