Home | History | Annotate | Download | only in asm
      1 #! /usr/bin/env perl
      2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 # ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     12 # project. The module is, however, dual licensed under OpenSSL and
     13 # CRYPTOGAMS licenses depending on where you obtain it. For further
     14 # details see http://www.openssl.org/~appro/cryptogams/.
     15 # ====================================================================
     16 #
     17 # This module implements support for Intel AES-NI extension. In
     18 # OpenSSL context it's used with Intel engine, but can also be used as
     19 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
     20 # details].
     21 #
     22 # Performance.
     23 #
     24 # To start with see corresponding paragraph in aesni-x86_64.pl...
     25 # Instead of filling table similar to one found there I've chosen to
     26 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
     27 # The simplified table below represents 32-bit performance relative
     28 # to 64-bit one in every given point. Ratios vary for different
     29 # encryption modes, therefore interval values.
     30 #
     31 #	16-byte     64-byte     256-byte    1-KB        8-KB
     32 #	53-67%      67-84%      91-94%      95-98%      97-99.5%
     33 #
     34 # Lower ratios for smaller block sizes are perfectly understandable,
     35 # because function call overhead is higher in 32-bit mode. Largest
     36 # 8-KB block performance is virtually same: 32-bit code is less than
     37 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
     38 
     39 # January 2011
     40 #
     41 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
     42 # interleaves at most 6 aes[enc|dec] instructions, because there are
     43 # not enough registers for 8x interleave [which should be optimal for
     44 # Sandy Bridge]. Actually, performance results for 6x interleave
     45 # factor presented in aesni-x86_64.pl (except for CTR) are for this
     46 # module.
     47 
     48 # April 2011
     49 #
     50 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
     51 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
     52 
     53 # November 2015
     54 #
     55 # Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
     56 
     57 ######################################################################
     58 # Current large-block performance in cycles per byte processed with
     59 # 128-bit key (less is better).
     60 #
     61 #		CBC en-/decrypt	CTR	XTS	ECB	OCB
     62 # Westmere	3.77/1.37	1.37	1.52	1.27
     63 # * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
     64 # Haswell	4.44/0.80	0.97	1.03	0.72	0.76
     65 # Skylake	2.68/0.65	0.65	0.66	0.64	0.66
     66 # Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
     67 # Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
     68 # Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
     69 
     70 $PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
     71 			# generates drop-in replacement for
     72 			# crypto/aes/asm/aes-586.pl:-)
     73 $AESNI_PREFIX="aes_hw";
     74 $inline=1;		# inline _aesni_[en|de]crypt
     75 
     76 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     77 push(@INC,"${dir}","${dir}../../../perlasm");
     78 require "x86asm.pl";
     79 
     80 $output = pop;
     81 open OUT,">$output";
     82 *STDOUT=*OUT;
     83 
     84 &asm_init($ARGV[0]);
     85 
     86 &external_label("OPENSSL_ia32cap_P");
     87 &preprocessor_ifndef("NDEBUG")
     88 &external_label("BORINGSSL_function_hit");
     89 &preprocessor_endif();
     90 &static_label("key_const");
     91 
     92 if ($PREFIX eq $AESNI_PREFIX)	{ $movekey=\&movups; }
     93 else			{ $movekey=\&movups; }
     94 
     95 $len="eax";
     96 $rounds="ecx";
     97 $key="edx";
     98 $inp="esi";
     99 $out="edi";
    100 $rounds_="ebx";	# backup copy for $rounds
    101 $key_="ebp";	# backup copy for $key
    102 
    103 $rndkey0="xmm0";
    104 $rndkey1="xmm1";
    105 $inout0="xmm2";
    106 $inout1="xmm3";
    107 $inout2="xmm4";
    108 $inout3="xmm5";	$in1="xmm5";
    109 $inout4="xmm6";	$in0="xmm6";
    110 $inout5="xmm7";	$ivec="xmm7";
    111 
    112 # AESNI extension
    113 sub aeskeygenassist
    114 { my($dst,$src,$imm)=@_;
    115     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    116     {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    117 }
    118 sub aescommon
    119 { my($opcodelet,$dst,$src)=@_;
    120     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    121     {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
    122 }
    123 sub aesimc	{ aescommon(0xdb,@_); }
    124 sub aesenc	{ aescommon(0xdc,@_); }
    125 sub aesenclast	{ aescommon(0xdd,@_); }
    126 sub aesdec	{ aescommon(0xde,@_); }
    127 sub aesdeclast	{ aescommon(0xdf,@_); }
    128 
    130 # Inline version of internal aesni_[en|de]crypt1
    131 { my $sn;
    132 sub aesni_inline_generate1
    133 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
    134   $sn++;
    135 
    136     &$movekey		($rndkey0,&QWP(0,$key));
    137     &$movekey		($rndkey1,&QWP(16,$key));
    138     &xorps		($ivec,$rndkey0)	if (defined($ivec));
    139     &lea		($key,&DWP(32,$key));
    140     &xorps		($inout,$ivec)		if (defined($ivec));
    141     &xorps		($inout,$rndkey0)	if (!defined($ivec));
    142     &set_label("${p}1_loop_$sn");
    143 	eval"&aes${p}	($inout,$rndkey1)";
    144 	&dec		($rounds);
    145 	&$movekey	($rndkey1,&QWP(0,$key));
    146 	&lea		($key,&DWP(16,$key));
    147     &jnz		(&label("${p}1_loop_$sn"));
    148     eval"&aes${p}last	($inout,$rndkey1)";
    149 }}
    150 
    151 sub aesni_generate1	# fully unrolled loop
    152 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
    153 
    154     &function_begin_B("_aesni_${p}rypt1");
    155 	&movups		($rndkey0,&QWP(0,$key));
    156 	&$movekey	($rndkey1,&QWP(0x10,$key));
    157 	&xorps		($inout,$rndkey0);
    158 	&$movekey	($rndkey0,&QWP(0x20,$key));
    159 	&lea		($key,&DWP(0x30,$key));
    160 	&cmp		($rounds,11);
    161 	&jb		(&label("${p}128"));
    162 	&lea		($key,&DWP(0x20,$key));
    163 	&je		(&label("${p}192"));
    164 	&lea		($key,&DWP(0x20,$key));
    165 	eval"&aes${p}	($inout,$rndkey1)";
    166 	&$movekey	($rndkey1,&QWP(-0x40,$key));
    167 	eval"&aes${p}	($inout,$rndkey0)";
    168 	&$movekey	($rndkey0,&QWP(-0x30,$key));
    169     &set_label("${p}192");
    170 	eval"&aes${p}	($inout,$rndkey1)";
    171 	&$movekey	($rndkey1,&QWP(-0x20,$key));
    172 	eval"&aes${p}	($inout,$rndkey0)";
    173 	&$movekey	($rndkey0,&QWP(-0x10,$key));
    174     &set_label("${p}128");
    175 	eval"&aes${p}	($inout,$rndkey1)";
    176 	&$movekey	($rndkey1,&QWP(0,$key));
    177 	eval"&aes${p}	($inout,$rndkey0)";
    178 	&$movekey	($rndkey0,&QWP(0x10,$key));
    179 	eval"&aes${p}	($inout,$rndkey1)";
    180 	&$movekey	($rndkey1,&QWP(0x20,$key));
    181 	eval"&aes${p}	($inout,$rndkey0)";
    182 	&$movekey	($rndkey0,&QWP(0x30,$key));
    183 	eval"&aes${p}	($inout,$rndkey1)";
    184 	&$movekey	($rndkey1,&QWP(0x40,$key));
    185 	eval"&aes${p}	($inout,$rndkey0)";
    186 	&$movekey	($rndkey0,&QWP(0x50,$key));
    187 	eval"&aes${p}	($inout,$rndkey1)";
    188 	&$movekey	($rndkey1,&QWP(0x60,$key));
    189 	eval"&aes${p}	($inout,$rndkey0)";
    190 	&$movekey	($rndkey0,&QWP(0x70,$key));
    191 	eval"&aes${p}	($inout,$rndkey1)";
    192     eval"&aes${p}last	($inout,$rndkey0)";
    193     &ret();
    194     &function_end_B("_aesni_${p}rypt1");
    195 }
    196 
    198 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
    199 &aesni_generate1("enc") if (!$inline);
    200 &function_begin_B("${PREFIX}_encrypt");
    201 	&record_function_hit(1);
    202 
    203 	&mov	("eax",&wparam(0));
    204 	&mov	($key,&wparam(2));
    205 	&movups	($inout0,&QWP(0,"eax"));
    206 	&mov	($rounds,&DWP(240,$key));
    207 	&mov	("eax",&wparam(1));
    208 	if ($inline)
    209 	{   &aesni_inline_generate1("enc");	}
    210 	else
    211 	{   &call	("_aesni_encrypt1");	}
    212 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    213 	&pxor	($rndkey1,$rndkey1);
    214 	&movups	(&QWP(0,"eax"),$inout0);
    215 	&pxor	($inout0,$inout0);
    216 	&ret	();
    217 &function_end_B("${PREFIX}_encrypt");
    218 
    219 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
    220 &aesni_generate1("dec") if(!$inline);
    221 &function_begin_B("${PREFIX}_decrypt");
    222 	&mov	("eax",&wparam(0));
    223 	&mov	($key,&wparam(2));
    224 	&movups	($inout0,&QWP(0,"eax"));
    225 	&mov	($rounds,&DWP(240,$key));
    226 	&mov	("eax",&wparam(1));
    227 	if ($inline)
    228 	{   &aesni_inline_generate1("dec");	}
    229 	else
    230 	{   &call	("_aesni_decrypt1");	}
    231 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    232 	&pxor	($rndkey1,$rndkey1);
    233 	&movups	(&QWP(0,"eax"),$inout0);
    234 	&pxor	($inout0,$inout0);
    235 	&ret	();
    236 &function_end_B("${PREFIX}_decrypt");
    237 
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was the 3x subroutine originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on account of Atom Silvermont. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals the corresponding instructions' latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
    252 
    253 sub aesni_generate2
    254 { my $p=shift;
    255 
    256     &function_begin_B("_aesni_${p}rypt2");
    257 	&$movekey	($rndkey0,&QWP(0,$key));
    258 	&shl		($rounds,4);
    259 	&$movekey	($rndkey1,&QWP(16,$key));
    260 	&xorps		($inout0,$rndkey0);
    261 	&pxor		($inout1,$rndkey0);
    262 	&$movekey	($rndkey0,&QWP(32,$key));
    263 	&lea		($key,&DWP(32,$key,$rounds));
    264 	&neg		($rounds);
    265 	&add		($rounds,16);
    266 
    267     &set_label("${p}2_loop");
    268 	eval"&aes${p}	($inout0,$rndkey1)";
    269 	eval"&aes${p}	($inout1,$rndkey1)";
    270 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    271 	&add		($rounds,32);
    272 	eval"&aes${p}	($inout0,$rndkey0)";
    273 	eval"&aes${p}	($inout1,$rndkey0)";
    274 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    275 	&jnz		(&label("${p}2_loop"));
    276     eval"&aes${p}	($inout0,$rndkey1)";
    277     eval"&aes${p}	($inout1,$rndkey1)";
    278     eval"&aes${p}last	($inout0,$rndkey0)";
    279     eval"&aes${p}last	($inout1,$rndkey0)";
    280     &ret();
    281     &function_end_B("_aesni_${p}rypt2");
    282 }
    283 
    284 sub aesni_generate3
    285 { my $p=shift;
    286 
    287     &function_begin_B("_aesni_${p}rypt3");
    288 	&$movekey	($rndkey0,&QWP(0,$key));
    289 	&shl		($rounds,4);
    290 	&$movekey	($rndkey1,&QWP(16,$key));
    291 	&xorps		($inout0,$rndkey0);
    292 	&pxor		($inout1,$rndkey0);
    293 	&pxor		($inout2,$rndkey0);
    294 	&$movekey	($rndkey0,&QWP(32,$key));
    295 	&lea		($key,&DWP(32,$key,$rounds));
    296 	&neg		($rounds);
    297 	&add		($rounds,16);
    298 
    299     &set_label("${p}3_loop");
    300 	eval"&aes${p}	($inout0,$rndkey1)";
    301 	eval"&aes${p}	($inout1,$rndkey1)";
    302 	eval"&aes${p}	($inout2,$rndkey1)";
    303 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    304 	&add		($rounds,32);
    305 	eval"&aes${p}	($inout0,$rndkey0)";
    306 	eval"&aes${p}	($inout1,$rndkey0)";
    307 	eval"&aes${p}	($inout2,$rndkey0)";
    308 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    309 	&jnz		(&label("${p}3_loop"));
    310     eval"&aes${p}	($inout0,$rndkey1)";
    311     eval"&aes${p}	($inout1,$rndkey1)";
    312     eval"&aes${p}	($inout2,$rndkey1)";
    313     eval"&aes${p}last	($inout0,$rndkey0)";
    314     eval"&aes${p}last	($inout1,$rndkey0)";
    315     eval"&aes${p}last	($inout2,$rndkey0)";
    316     &ret();
    317     &function_end_B("_aesni_${p}rypt3");
    318 }
    319 
    320 # 4x interleave is implemented to improve small block performance,
    321 # most notably [and naturally] 4 block by ~30%. One can argue that one
    322 # should have implemented 5x as well, but improvement  would be <20%,
    323 # so it's not worth it...
    324 sub aesni_generate4
    325 { my $p=shift;
    326 
    327     &function_begin_B("_aesni_${p}rypt4");
    328 	&$movekey	($rndkey0,&QWP(0,$key));
    329 	&$movekey	($rndkey1,&QWP(16,$key));
    330 	&shl		($rounds,4);
    331 	&xorps		($inout0,$rndkey0);
    332 	&pxor		($inout1,$rndkey0);
    333 	&pxor		($inout2,$rndkey0);
    334 	&pxor		($inout3,$rndkey0);
    335 	&$movekey	($rndkey0,&QWP(32,$key));
    336 	&lea		($key,&DWP(32,$key,$rounds));
    337 	&neg		($rounds);
    338 	&data_byte	(0x0f,0x1f,0x40,0x00);
    339 	&add		($rounds,16);
    340 
    341     &set_label("${p}4_loop");
    342 	eval"&aes${p}	($inout0,$rndkey1)";
    343 	eval"&aes${p}	($inout1,$rndkey1)";
    344 	eval"&aes${p}	($inout2,$rndkey1)";
    345 	eval"&aes${p}	($inout3,$rndkey1)";
    346 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    347 	&add		($rounds,32);
    348 	eval"&aes${p}	($inout0,$rndkey0)";
    349 	eval"&aes${p}	($inout1,$rndkey0)";
    350 	eval"&aes${p}	($inout2,$rndkey0)";
    351 	eval"&aes${p}	($inout3,$rndkey0)";
    352 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    353     &jnz		(&label("${p}4_loop"));
    354 
    355     eval"&aes${p}	($inout0,$rndkey1)";
    356     eval"&aes${p}	($inout1,$rndkey1)";
    357     eval"&aes${p}	($inout2,$rndkey1)";
    358     eval"&aes${p}	($inout3,$rndkey1)";
    359     eval"&aes${p}last	($inout0,$rndkey0)";
    360     eval"&aes${p}last	($inout1,$rndkey0)";
    361     eval"&aes${p}last	($inout2,$rndkey0)";
    362     eval"&aes${p}last	($inout3,$rndkey0)";
    363     &ret();
    364     &function_end_B("_aesni_${p}rypt4");
    365 }
    366 
    367 sub aesni_generate6
    368 { my $p=shift;
    369 
    370     &function_begin_B("_aesni_${p}rypt6");
    371     &static_label("_aesni_${p}rypt6_enter");
    372 	&$movekey	($rndkey0,&QWP(0,$key));
    373 	&shl		($rounds,4);
    374 	&$movekey	($rndkey1,&QWP(16,$key));
    375 	&xorps		($inout0,$rndkey0);
    376 	&pxor		($inout1,$rndkey0);	# pxor does better here
    377 	&pxor		($inout2,$rndkey0);
    378 	eval"&aes${p}	($inout0,$rndkey1)";
    379 	&pxor		($inout3,$rndkey0);
    380 	&pxor		($inout4,$rndkey0);
    381 	eval"&aes${p}	($inout1,$rndkey1)";
    382 	&lea		($key,&DWP(32,$key,$rounds));
    383 	&neg		($rounds);
    384 	eval"&aes${p}	($inout2,$rndkey1)";
    385 	&pxor		($inout5,$rndkey0);
    386 	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
    387 	&add		($rounds,16);
    388 	&jmp		(&label("_aesni_${p}rypt6_inner"));
    389 
    390     &set_label("${p}6_loop",16);
    391 	eval"&aes${p}	($inout0,$rndkey1)";
    392 	eval"&aes${p}	($inout1,$rndkey1)";
    393 	eval"&aes${p}	($inout2,$rndkey1)";
    394     &set_label("_aesni_${p}rypt6_inner");
    395 	eval"&aes${p}	($inout3,$rndkey1)";
    396 	eval"&aes${p}	($inout4,$rndkey1)";
    397 	eval"&aes${p}	($inout5,$rndkey1)";
    398     &set_label("_aesni_${p}rypt6_enter");
    399 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    400 	&add		($rounds,32);
    401 	eval"&aes${p}	($inout0,$rndkey0)";
    402 	eval"&aes${p}	($inout1,$rndkey0)";
    403 	eval"&aes${p}	($inout2,$rndkey0)";
    404 	eval"&aes${p}	($inout3,$rndkey0)";
    405 	eval"&aes${p}	($inout4,$rndkey0)";
    406 	eval"&aes${p}	($inout5,$rndkey0)";
    407 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    408     &jnz		(&label("${p}6_loop"));
    409 
    410     eval"&aes${p}	($inout0,$rndkey1)";
    411     eval"&aes${p}	($inout1,$rndkey1)";
    412     eval"&aes${p}	($inout2,$rndkey1)";
    413     eval"&aes${p}	($inout3,$rndkey1)";
    414     eval"&aes${p}	($inout4,$rndkey1)";
    415     eval"&aes${p}	($inout5,$rndkey1)";
    416     eval"&aes${p}last	($inout0,$rndkey0)";
    417     eval"&aes${p}last	($inout1,$rndkey0)";
    418     eval"&aes${p}last	($inout2,$rndkey0)";
    419     eval"&aes${p}last	($inout3,$rndkey0)";
    420     eval"&aes${p}last	($inout4,$rndkey0)";
    421     eval"&aes${p}last	($inout5,$rndkey0)";
    422     &ret();
    423     &function_end_B("_aesni_${p}rypt6");
    424 }
    425 &aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
    426 &aesni_generate2("dec");
    427 &aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
    428 &aesni_generate3("dec");
    429 &aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
    430 &aesni_generate4("dec");
    431 &aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);
    432 &aesni_generate6("dec");
    433 
    435 if ($PREFIX eq $AESNI_PREFIX) {
    436 ######################################################################
    437 # void aes_hw_ecb_encrypt (const void *in, void *out,
    438 #                         size_t length, const AES_KEY *key,
    439 #                         int enc);
    440 &function_begin("${PREFIX}_ecb_encrypt");
    441 	&mov	($inp,&wparam(0));
    442 	&mov	($out,&wparam(1));
    443 	&mov	($len,&wparam(2));
    444 	&mov	($key,&wparam(3));
    445 	&mov	($rounds_,&wparam(4));
    446 	&and	($len,-16);
    447 	&jz	(&label("ecb_ret"));
    448 	&mov	($rounds,&DWP(240,$key));
    449 	&test	($rounds_,$rounds_);
    450 	&jz	(&label("ecb_decrypt"));
    451 
    452 	&mov	($key_,$key);		# backup $key
    453 	&mov	($rounds_,$rounds);	# backup $rounds
    454 	&cmp	($len,0x60);
    455 	&jb	(&label("ecb_enc_tail"));
    456 
    457 	&movdqu	($inout0,&QWP(0,$inp));
    458 	&movdqu	($inout1,&QWP(0x10,$inp));
    459 	&movdqu	($inout2,&QWP(0x20,$inp));
    460 	&movdqu	($inout3,&QWP(0x30,$inp));
    461 	&movdqu	($inout4,&QWP(0x40,$inp));
    462 	&movdqu	($inout5,&QWP(0x50,$inp));
    463 	&lea	($inp,&DWP(0x60,$inp));
    464 	&sub	($len,0x60);
    465 	&jmp	(&label("ecb_enc_loop6_enter"));
    466 
    467 &set_label("ecb_enc_loop6",16);
    468 	&movups	(&QWP(0,$out),$inout0);
    469 	&movdqu	($inout0,&QWP(0,$inp));
    470 	&movups	(&QWP(0x10,$out),$inout1);
    471 	&movdqu	($inout1,&QWP(0x10,$inp));
    472 	&movups	(&QWP(0x20,$out),$inout2);
    473 	&movdqu	($inout2,&QWP(0x20,$inp));
    474 	&movups	(&QWP(0x30,$out),$inout3);
    475 	&movdqu	($inout3,&QWP(0x30,$inp));
    476 	&movups	(&QWP(0x40,$out),$inout4);
    477 	&movdqu	($inout4,&QWP(0x40,$inp));
    478 	&movups	(&QWP(0x50,$out),$inout5);
    479 	&lea	($out,&DWP(0x60,$out));
    480 	&movdqu	($inout5,&QWP(0x50,$inp));
    481 	&lea	($inp,&DWP(0x60,$inp));
    482 &set_label("ecb_enc_loop6_enter");
    483 
    484 	&call	("_aesni_encrypt6");
    485 
    486 	&mov	($key,$key_);		# restore $key
    487 	&mov	($rounds,$rounds_);	# restore $rounds
    488 	&sub	($len,0x60);
    489 	&jnc	(&label("ecb_enc_loop6"));
    490 
    491 	&movups	(&QWP(0,$out),$inout0);
    492 	&movups	(&QWP(0x10,$out),$inout1);
    493 	&movups	(&QWP(0x20,$out),$inout2);
    494 	&movups	(&QWP(0x30,$out),$inout3);
    495 	&movups	(&QWP(0x40,$out),$inout4);
    496 	&movups	(&QWP(0x50,$out),$inout5);
    497 	&lea	($out,&DWP(0x60,$out));
    498 	&add	($len,0x60);
    499 	&jz	(&label("ecb_ret"));
    500 
    501 &set_label("ecb_enc_tail");
    502 	&movups	($inout0,&QWP(0,$inp));
    503 	&cmp	($len,0x20);
    504 	&jb	(&label("ecb_enc_one"));
    505 	&movups	($inout1,&QWP(0x10,$inp));
    506 	&je	(&label("ecb_enc_two"));
    507 	&movups	($inout2,&QWP(0x20,$inp));
    508 	&cmp	($len,0x40);
    509 	&jb	(&label("ecb_enc_three"));
    510 	&movups	($inout3,&QWP(0x30,$inp));
    511 	&je	(&label("ecb_enc_four"));
    512 	&movups	($inout4,&QWP(0x40,$inp));
    513 	&xorps	($inout5,$inout5);
    514 	&call	("_aesni_encrypt6");
    515 	&movups	(&QWP(0,$out),$inout0);
    516 	&movups	(&QWP(0x10,$out),$inout1);
    517 	&movups	(&QWP(0x20,$out),$inout2);
    518 	&movups	(&QWP(0x30,$out),$inout3);
    519 	&movups	(&QWP(0x40,$out),$inout4);
    520 	jmp	(&label("ecb_ret"));
    521 
    522 &set_label("ecb_enc_one",16);
    523 	if ($inline)
    524 	{   &aesni_inline_generate1("enc");	}
    525 	else
    526 	{   &call	("_aesni_encrypt1");	}
    527 	&movups	(&QWP(0,$out),$inout0);
    528 	&jmp	(&label("ecb_ret"));
    529 
    530 &set_label("ecb_enc_two",16);
    531 	&call	("_aesni_encrypt2");
    532 	&movups	(&QWP(0,$out),$inout0);
    533 	&movups	(&QWP(0x10,$out),$inout1);
    534 	&jmp	(&label("ecb_ret"));
    535 
    536 &set_label("ecb_enc_three",16);
    537 	&call	("_aesni_encrypt3");
    538 	&movups	(&QWP(0,$out),$inout0);
    539 	&movups	(&QWP(0x10,$out),$inout1);
    540 	&movups	(&QWP(0x20,$out),$inout2);
    541 	&jmp	(&label("ecb_ret"));
    542 
    543 &set_label("ecb_enc_four",16);
    544 	&call	("_aesni_encrypt4");
    545 	&movups	(&QWP(0,$out),$inout0);
    546 	&movups	(&QWP(0x10,$out),$inout1);
    547 	&movups	(&QWP(0x20,$out),$inout2);
    548 	&movups	(&QWP(0x30,$out),$inout3);
    549 	&jmp	(&label("ecb_ret"));
    550 ######################################################################
    551 &set_label("ecb_decrypt",16);
    552 	&mov	($key_,$key);		# backup $key
    553 	&mov	($rounds_,$rounds);	# backup $rounds
    554 	&cmp	($len,0x60);
    555 	&jb	(&label("ecb_dec_tail"));
    556 
    557 	&movdqu	($inout0,&QWP(0,$inp));
    558 	&movdqu	($inout1,&QWP(0x10,$inp));
    559 	&movdqu	($inout2,&QWP(0x20,$inp));
    560 	&movdqu	($inout3,&QWP(0x30,$inp));
    561 	&movdqu	($inout4,&QWP(0x40,$inp));
    562 	&movdqu	($inout5,&QWP(0x50,$inp));
    563 	&lea	($inp,&DWP(0x60,$inp));
    564 	&sub	($len,0x60);
    565 	&jmp	(&label("ecb_dec_loop6_enter"));
    566 
    567 &set_label("ecb_dec_loop6",16);
    568 	&movups	(&QWP(0,$out),$inout0);
    569 	&movdqu	($inout0,&QWP(0,$inp));
    570 	&movups	(&QWP(0x10,$out),$inout1);
    571 	&movdqu	($inout1,&QWP(0x10,$inp));
    572 	&movups	(&QWP(0x20,$out),$inout2);
    573 	&movdqu	($inout2,&QWP(0x20,$inp));
    574 	&movups	(&QWP(0x30,$out),$inout3);
    575 	&movdqu	($inout3,&QWP(0x30,$inp));
    576 	&movups	(&QWP(0x40,$out),$inout4);
    577 	&movdqu	($inout4,&QWP(0x40,$inp));
    578 	&movups	(&QWP(0x50,$out),$inout5);
    579 	&lea	($out,&DWP(0x60,$out));
    580 	&movdqu	($inout5,&QWP(0x50,$inp));
    581 	&lea	($inp,&DWP(0x60,$inp));
    582 &set_label("ecb_dec_loop6_enter");
    583 
    584 	&call	("_aesni_decrypt6");
    585 
    586 	&mov	($key,$key_);		# restore $key
    587 	&mov	($rounds,$rounds_);	# restore $rounds
    588 	&sub	($len,0x60);
    589 	&jnc	(&label("ecb_dec_loop6"));
    590 
    591 	&movups	(&QWP(0,$out),$inout0);
    592 	&movups	(&QWP(0x10,$out),$inout1);
    593 	&movups	(&QWP(0x20,$out),$inout2);
    594 	&movups	(&QWP(0x30,$out),$inout3);
    595 	&movups	(&QWP(0x40,$out),$inout4);
    596 	&movups	(&QWP(0x50,$out),$inout5);
    597 	&lea	($out,&DWP(0x60,$out));
    598 	&add	($len,0x60);
    599 	&jz	(&label("ecb_ret"));
    600 
    601 &set_label("ecb_dec_tail");
    602 	&movups	($inout0,&QWP(0,$inp));
    603 	&cmp	($len,0x20);
    604 	&jb	(&label("ecb_dec_one"));
    605 	&movups	($inout1,&QWP(0x10,$inp));
    606 	&je	(&label("ecb_dec_two"));
    607 	&movups	($inout2,&QWP(0x20,$inp));
    608 	&cmp	($len,0x40);
    609 	&jb	(&label("ecb_dec_three"));
    610 	&movups	($inout3,&QWP(0x30,$inp));
    611 	&je	(&label("ecb_dec_four"));
    612 	&movups	($inout4,&QWP(0x40,$inp));
    613 	&xorps	($inout5,$inout5);
    614 	&call	("_aesni_decrypt6");
    615 	&movups	(&QWP(0,$out),$inout0);
    616 	&movups	(&QWP(0x10,$out),$inout1);
    617 	&movups	(&QWP(0x20,$out),$inout2);
    618 	&movups	(&QWP(0x30,$out),$inout3);
    619 	&movups	(&QWP(0x40,$out),$inout4);
    620 	&jmp	(&label("ecb_ret"));
    621 
    622 &set_label("ecb_dec_one",16);
    623 	if ($inline)
    624 	{   &aesni_inline_generate1("dec");	}
    625 	else
    626 	{   &call	("_aesni_decrypt1");	}
    627 	&movups	(&QWP(0,$out),$inout0);
    628 	&jmp	(&label("ecb_ret"));
    629 
    630 &set_label("ecb_dec_two",16);
    631 	&call	("_aesni_decrypt2");
    632 	&movups	(&QWP(0,$out),$inout0);
    633 	&movups	(&QWP(0x10,$out),$inout1);
    634 	&jmp	(&label("ecb_ret"));
    635 
    636 &set_label("ecb_dec_three",16);
    637 	&call	("_aesni_decrypt3");
    638 	&movups	(&QWP(0,$out),$inout0);
    639 	&movups	(&QWP(0x10,$out),$inout1);
    640 	&movups	(&QWP(0x20,$out),$inout2);
    641 	&jmp	(&label("ecb_ret"));
    642 
    643 &set_label("ecb_dec_four",16);
    644 	&call	("_aesni_decrypt4");
    645 	&movups	(&QWP(0,$out),$inout0);
    646 	&movups	(&QWP(0x10,$out),$inout1);
    647 	&movups	(&QWP(0x20,$out),$inout2);
    648 	&movups	(&QWP(0x30,$out),$inout3);
    649 
    650 &set_label("ecb_ret");
    651 	&pxor	("xmm0","xmm0");		# clear register bank
    652 	&pxor	("xmm1","xmm1");
    653 	&pxor	("xmm2","xmm2");
    654 	&pxor	("xmm3","xmm3");
    655 	&pxor	("xmm4","xmm4");
    656 	&pxor	("xmm5","xmm5");
    657 	&pxor	("xmm6","xmm6");
    658 	&pxor	("xmm7","xmm7");
    659 &function_end("${PREFIX}_ecb_encrypt");
    660 
    662 ######################################################################
    663 # void aes_hw_ccm64_[en|de]crypt_blocks (const void *in, void *out,
    664 #                         size_t blocks, const AES_KEY *key,
    665 #                         const char *ivec,char *cmac);
    666 #
    667 # Handles only complete blocks, operates on 64-bit counter and
    668 # does not update *ivec! Nor does it finalize CMAC value
    669 # (see engine/eng_aesni.c for details)
    670 #
    671 { my $cmac=$inout1;
    672 &function_begin("${PREFIX}_ccm64_encrypt_blocks");
    673 	&mov	($inp,&wparam(0));
    674 	&mov	($out,&wparam(1));
    675 	&mov	($len,&wparam(2));
    676 	&mov	($key,&wparam(3));
    677 	&mov	($rounds_,&wparam(4));
    678 	&mov	($rounds,&wparam(5));
    679 	&mov	($key_,"esp");
    680 	&sub	("esp",60);
    681 	&and	("esp",-16);			# align stack
    682 	&mov	(&DWP(48,"esp"),$key_);
    683 
    684 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    685 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    686 	&mov	($rounds,&DWP(240,$key));
    687 
    688 	# compose byte-swap control mask for pshufb on stack
    689 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    690 	&mov	(&DWP(4,"esp"),0x08090a0b);
    691 	&mov	(&DWP(8,"esp"),0x04050607);
    692 	&mov	(&DWP(12,"esp"),0x00010203);
    693 
    694 	# compose counter increment vector on stack
    695 	&mov	($rounds_,1);
    696 	&xor	($key_,$key_);
    697 	&mov	(&DWP(16,"esp"),$rounds_);
    698 	&mov	(&DWP(20,"esp"),$key_);
    699 	&mov	(&DWP(24,"esp"),$key_);
    700 	&mov	(&DWP(28,"esp"),$key_);
    701 
    702 	&shl	($rounds,4);
    703 	&mov	($rounds_,16);
    704 	&lea	($key_,&DWP(0,$key));
    705 	&movdqa	($inout3,&QWP(0,"esp"));
    706 	&movdqa	($inout0,$ivec);
    707 	&lea	($key,&DWP(32,$key,$rounds));
    708 	&sub	($rounds_,$rounds);
    709 	&pshufb	($ivec,$inout3);
    710 
    711 &set_label("ccm64_enc_outer");
    712 	&$movekey	($rndkey0,&QWP(0,$key_));
    713 	&mov		($rounds,$rounds_);
    714 	&movups		($in0,&QWP(0,$inp));
    715 
    716 	&xorps		($inout0,$rndkey0);
    717 	&$movekey	($rndkey1,&QWP(16,$key_));
    718 	&xorps		($rndkey0,$in0);
    719 	&xorps		($cmac,$rndkey0);		# cmac^=inp
    720 	&$movekey	($rndkey0,&QWP(32,$key_));
    721 
    722 &set_label("ccm64_enc2_loop");
    723 	&aesenc		($inout0,$rndkey1);
    724 	&aesenc		($cmac,$rndkey1);
    725 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    726 	&add		($rounds,32);
    727 	&aesenc		($inout0,$rndkey0);
    728 	&aesenc		($cmac,$rndkey0);
    729 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    730 	&jnz		(&label("ccm64_enc2_loop"));
    731 	&aesenc		($inout0,$rndkey1);
    732 	&aesenc		($cmac,$rndkey1);
    733 	&paddq		($ivec,&QWP(16,"esp"));
    734 	&dec		($len);
    735 	&aesenclast	($inout0,$rndkey0);
    736 	&aesenclast	($cmac,$rndkey0);
    737 
    738 	&lea	($inp,&DWP(16,$inp));
    739 	&xorps	($in0,$inout0);			# inp^=E(ivec)
    740 	&movdqa	($inout0,$ivec);
    741 	&movups	(&QWP(0,$out),$in0);		# save output
    742 	&pshufb	($inout0,$inout3);
    743 	&lea	($out,&DWP(16,$out));
    744 	&jnz	(&label("ccm64_enc_outer"));
    745 
    746 	&mov	("esp",&DWP(48,"esp"));
    747 	&mov	($out,&wparam(5));
    748 	&movups	(&QWP(0,$out),$cmac);
    749 
    750 	&pxor	("xmm0","xmm0");		# clear register bank
    751 	&pxor	("xmm1","xmm1");
    752 	&pxor	("xmm2","xmm2");
    753 	&pxor	("xmm3","xmm3");
    754 	&pxor	("xmm4","xmm4");
    755 	&pxor	("xmm5","xmm5");
    756 	&pxor	("xmm6","xmm6");
    757 	&pxor	("xmm7","xmm7");
    758 &function_end("${PREFIX}_ccm64_encrypt_blocks");
    759 
    760 &function_begin("${PREFIX}_ccm64_decrypt_blocks");
    761 	&mov	($inp,&wparam(0));
    762 	&mov	($out,&wparam(1));
    763 	&mov	($len,&wparam(2));
    764 	&mov	($key,&wparam(3));
    765 	&mov	($rounds_,&wparam(4));
    766 	&mov	($rounds,&wparam(5));
    767 	&mov	($key_,"esp");
    768 	&sub	("esp",60);
    769 	&and	("esp",-16);			# align stack
    770 	&mov	(&DWP(48,"esp"),$key_);
    771 
    772 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    773 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    774 	&mov	($rounds,&DWP(240,$key));
    775 
    776 	# compose byte-swap control mask for pshufb on stack
    777 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    778 	&mov	(&DWP(4,"esp"),0x08090a0b);
    779 	&mov	(&DWP(8,"esp"),0x04050607);
    780 	&mov	(&DWP(12,"esp"),0x00010203);
    781 
    782 	# compose counter increment vector on stack
    783 	&mov	($rounds_,1);
    784 	&xor	($key_,$key_);
    785 	&mov	(&DWP(16,"esp"),$rounds_);
    786 	&mov	(&DWP(20,"esp"),$key_);
    787 	&mov	(&DWP(24,"esp"),$key_);
    788 	&mov	(&DWP(28,"esp"),$key_);
    789 
    790 	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
    791 	&movdqa	($inout0,$ivec);
    792 
    793 	&mov	($key_,$key);
    794 	&mov	($rounds_,$rounds);
    795 
    796 	&pshufb	($ivec,$inout3);
    797 	if ($inline)
    798 	{   &aesni_inline_generate1("enc");	}
    799 	else
    800 	{   &call	("_aesni_encrypt1");	}
    801 	&shl	($rounds_,4);
    802 	&mov	($rounds,16);
    803 	&movups	($in0,&QWP(0,$inp));		# load inp
    804 	&paddq	($ivec,&QWP(16,"esp"));
    805 	&lea	($inp,&QWP(16,$inp));
    806 	&sub	($rounds,$rounds_);
    807 	&lea	($key,&DWP(32,$key_,$rounds_));
    808 	&mov	($rounds_,$rounds);
    809 	&jmp	(&label("ccm64_dec_outer"));
    810 
    811 &set_label("ccm64_dec_outer",16);
    812 	&xorps	($in0,$inout0);			# inp ^= E(ivec)
    813 	&movdqa	($inout0,$ivec);
    814 	&movups	(&QWP(0,$out),$in0);		# save output
    815 	&lea	($out,&DWP(16,$out));
    816 	&pshufb	($inout0,$inout3);
    817 
    818 	&sub	($len,1);
    819 	&jz	(&label("ccm64_dec_break"));
    820 
    821 	&$movekey	($rndkey0,&QWP(0,$key_));
    822 	&mov		($rounds,$rounds_);
    823 	&$movekey	($rndkey1,&QWP(16,$key_));
    824 	&xorps		($in0,$rndkey0);
    825 	&xorps		($inout0,$rndkey0);
    826 	&xorps		($cmac,$in0);		# cmac^=out
    827 	&$movekey	($rndkey0,&QWP(32,$key_));
    828 
    829 &set_label("ccm64_dec2_loop");
    830 	&aesenc		($inout0,$rndkey1);
    831 	&aesenc		($cmac,$rndkey1);
    832 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    833 	&add		($rounds,32);
    834 	&aesenc		($inout0,$rndkey0);
    835 	&aesenc		($cmac,$rndkey0);
    836 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    837 	&jnz		(&label("ccm64_dec2_loop"));
    838 	&movups		($in0,&QWP(0,$inp));	# load inp
    839 	&paddq		($ivec,&QWP(16,"esp"));
    840 	&aesenc		($inout0,$rndkey1);
    841 	&aesenc		($cmac,$rndkey1);
    842 	&aesenclast	($inout0,$rndkey0);
    843 	&aesenclast	($cmac,$rndkey0);
    844 	&lea		($inp,&QWP(16,$inp));
    845 	&jmp	(&label("ccm64_dec_outer"));
    846 
    847 &set_label("ccm64_dec_break",16);
    848 	&mov	($rounds,&DWP(240,$key_));
    849 	&mov	($key,$key_);
    850 	if ($inline)
    851 	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
    852 	else
    853 	{   &call	("_aesni_encrypt1",$cmac);	}
    854 
    855 	&mov	("esp",&DWP(48,"esp"));
    856 	&mov	($out,&wparam(5));
    857 	&movups	(&QWP(0,$out),$cmac);
    858 
    859 	&pxor	("xmm0","xmm0");		# clear register bank
    860 	&pxor	("xmm1","xmm1");
    861 	&pxor	("xmm2","xmm2");
    862 	&pxor	("xmm3","xmm3");
    863 	&pxor	("xmm4","xmm4");
    864 	&pxor	("xmm5","xmm5");
    865 	&pxor	("xmm6","xmm6");
    866 	&pxor	("xmm7","xmm7");
    867 &function_end("${PREFIX}_ccm64_decrypt_blocks");
    868 }
    869 
    871 ######################################################################
    872 # void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
    873 #                         size_t blocks, const AES_KEY *key,
    874 #                         const char *ivec);
    875 #
    876 # Handles only complete blocks, operates on 32-bit counter and
    877 # does not update *ivec! (see crypto/modes/ctr128.c for details)
    878 #
    879 # stack layout:
    880 #	0	pshufb mask
    881 #	16	vector addend: 0,6,6,6
     882 #	32	counter-less ivec (xor-ed with key[0] while in the 6x loop)
    883 #	48	1st triplet of counter vector
    884 #	64	2nd triplet of counter vector
    885 #	80	saved %esp
    886 
     887 &function_begin("${PREFIX}_ctr32_encrypt_blocks");
         	# Emits the CTR-mode bulk routine: the key stream is produced by
         	# encrypting ivec with its last 32-bit word treated as a big-endian
         	# counter, and is xor-ed into the input. The main loop handles six
         	# blocks per iteration; 1..5 leftover blocks take dedicated paths.
         	#
         	# Register usage visible below: $rounds_ first carries the ivec
         	# pointer, then the counter value, then the "twisted" round offset;
         	# $key_ first carries the caller's %esp, later a backup of $key.
     888 	&record_function_hit(0);
     889 
     890 	&mov	($inp,&wparam(0));
     891 	&mov	($out,&wparam(1));
     892 	&mov	($len,&wparam(2));
     893 	&mov	($key,&wparam(3));
     894 	&mov	($rounds_,&wparam(4));
     895 	&mov	($key_,"esp");
     896 	&sub	("esp",88);
     897 	&and	("esp",-16);			# align stack
     898 	&mov	(&DWP(80,"esp"),$key_);
         	# the caller's %esp now sits at 80(%esp); ctr32_ret reloads it
     899 
         	# exactly one block: skip all counter-vector setup
     900 	&cmp	($len,1);
     901 	&je	(&label("ctr32_one_shortcut"));
     902 
     903 	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
     904 
     905 	# compose byte-swap control mask for pshufb on stack
     906 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
     907 	&mov	(&DWP(4,"esp"),0x08090a0b);
     908 	&mov	(&DWP(8,"esp"),0x04050607);
     909 	&mov	(&DWP(12,"esp"),0x00010203);
     910 
     911 	# compose counter increment vector on stack
         	# (three dwords of 6 followed by a zero dword: each stored triplet
         	# lane advances by the six blocks consumed per loop iteration)
     912 	&mov	($rounds,6);
     913 	&xor	($key_,$key_);
     914 	&mov	(&DWP(16,"esp"),$rounds);
     915 	&mov	(&DWP(20,"esp"),$rounds);
     916 	&mov	(&DWP(24,"esp"),$rounds);
     917 	&mov	(&DWP(28,"esp"),$key_);
     918 
     919 	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
     920 	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
     921 
     922 	&mov	($rounds,&DWP(240,$key));	# key->rounds
     923 
     924 	# compose 2 vectors of 3x32-bit counters
         	# $rndkey0 = {ctr, ctr+1, ctr+2, 0}, $rndkey1 = {ctr+3, ctr+4, ctr+5, 0}
         	# in host byte order; $key_ serves as the scratch for ctr+3..ctr+5
     925 	&bswap	($rounds_);
     926 	&pxor	($rndkey0,$rndkey0);
     927 	&pxor	($rndkey1,$rndkey1);
     928 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
     929 	&pinsrd	($rndkey0,$rounds_,0);
     930 	&lea	($key_,&DWP(3,$rounds_));
     931 	&pinsrd	($rndkey1,$key_,0);
     932 	&inc	($rounds_);
     933 	&pinsrd	($rndkey0,$rounds_,1);
     934 	&inc	($key_);
     935 	&pinsrd	($rndkey1,$key_,1);
     936 	&inc	($rounds_);
     937 	&pinsrd	($rndkey0,$rounds_,2);
     938 	&inc	($key_);
     939 	&pinsrd	($rndkey1,$key_,2);
     940 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
     941 	&pshufb	($rndkey0,$inout0);		# byte swap
     942 	&movdqu	($inout4,&QWP(0,$key));		# key[0]
     943 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
     944 	&pshufb	($rndkey1,$inout0);		# byte swap
     945 
     946 	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
     947 	&pshufd	($inout1,$rndkey0,2<<6);
         	# fewer than six blocks in total: go straight to the tail dispatcher
     948 	&cmp	($len,6);
     949 	&jb	(&label("ctr32_tail"));
     950 	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
     951 	&shl	($rounds,4);
     952 	&mov	($rounds_,16);
     953 	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
     954 	&mov	($key_,$key);			# backup $key
     955 	&sub	($rounds_,$rounds);		# backup twisted $rounds
     956 	&lea	($key,&DWP(32,$key,$rounds));
         	# $len is kept biased by -6 inside the loop so that the borrow from
         	# the subtraction at the bottom doubles as the exit condition
     957 	&sub	($len,6);
     958 	&jmp	(&label("ctr32_loop6"));
     959 
     960 &set_label("ctr32_loop6",16);
     961 	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
         	# On entry $inout0/$inout1 already hold the first two counter blocks;
         	# the other four are spread out here, all six are merged with
         	# ivec^key[0], and the key[1] round is issued before entering the
         	# shared 6x body with key[2] preloaded in $rndkey0.
     962 	&pshufd	($inout2,$rndkey0,1<<6);
     963 	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
     964 	&pshufd	($inout3,$rndkey1,3<<6);
     965 	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
     966 	&pshufd	($inout4,$rndkey1,2<<6);
     967 	&pxor		($inout1,$rndkey0);
     968 	&pshufd	($inout5,$rndkey1,1<<6);
     969 	&$movekey	($rndkey1,&QWP(16,$key_));
     970 	&pxor		($inout2,$rndkey0);
     971 	&pxor		($inout3,$rndkey0);
     972 	&aesenc		($inout0,$rndkey1);
     973 	&pxor		($inout4,$rndkey0);
     974 	&pxor		($inout5,$rndkey0);
     975 	&aesenc		($inout1,$rndkey1);
     976 	&$movekey	($rndkey0,&QWP(32,$key_));
     977 	&mov		($rounds,$rounds_);
     978 	&aesenc		($inout2,$rndkey1);
     979 	&aesenc		($inout3,$rndkey1);
     980 	&aesenc		($inout4,$rndkey1);
     981 	&aesenc		($inout5,$rndkey1);
     982 
     983 	&call		(&label("_aesni_encrypt6_enter"));
     984 
         	# xor key stream into the first three input blocks and store them,
         	# interleaved with advancing both counter triplets by 6
     985 	&movups	($rndkey1,&QWP(0,$inp));
     986 	&movups	($rndkey0,&QWP(0x10,$inp));
     987 	&xorps	($inout0,$rndkey1);
     988 	&movups	($rndkey1,&QWP(0x20,$inp));
     989 	&xorps	($inout1,$rndkey0);
     990 	&movups	(&QWP(0,$out),$inout0);
     991 	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
     992 	&xorps	($inout2,$rndkey1);
     993 	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
     994 	&movups	(&QWP(0x10,$out),$inout1);
     995 	&movups	(&QWP(0x20,$out),$inout2);
     996 
     997 	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
     998 	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
     999 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
    1000 
         	# remaining three blocks; $inout1/$inout2 are free to act as input
         	# scratch because their results were stored above
    1001 	&movups	($inout1,&QWP(0x30,$inp));
    1002 	&movups	($inout2,&QWP(0x40,$inp));
    1003 	&xorps	($inout3,$inout1);
    1004 	&movups	($inout1,&QWP(0x50,$inp));
    1005 	&lea	($inp,&DWP(0x60,$inp));
    1006 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
    1007 	&pshufb	($rndkey0,$inout0);		# byte swap
    1008 	&xorps	($inout4,$inout2);
    1009 	&movups	(&QWP(0x30,$out),$inout3);
    1010 	&xorps	($inout5,$inout1);
    1011 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
    1012 	&pshufb	($rndkey1,$inout0);		# byte swap
    1013 	&movups	(&QWP(0x40,$out),$inout4);
    1014 	&pshufd	($inout0,$rndkey0,3<<6);
    1015 	&movups	(&QWP(0x50,$out),$inout5);
    1016 	&lea	($out,&DWP(0x60,$out));
    1017 
    1018 	&pshufd	($inout1,$rndkey0,2<<6);
    1019 	&sub	($len,6);
    1020 	&jnc	(&label("ctr32_loop6"));
    1021 
         	# undo the -6 bias; zero means the input was a multiple of six blocks
    1022 	&add	($len,6);
    1023 	&jz	(&label("ctr32_ret"));
         	# the tail code expects a plain counter-less ivec in $inout5, so xor
         	# key[0] back out of the value saved at 32(%esp)
    1024 	&movdqu	($inout5,&QWP(0,$key_));
    1025 	&mov	($key,$key_);
    1026 	&pxor	($inout5,&QWP(32,"esp"));	# restore counter-less ivec
    1027 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
    1028 
    1029 &set_label("ctr32_tail");
         	# 1..5 blocks remain. Counter blocks carry the counter in the upper
         	# dword only and $inout5 has that dword wiped, so por merges them.
         	# The je branches below reuse the flags of the preceding cmp; the
         	# SSE instructions in between do not modify EFLAGS.
    1030 	&por	($inout0,$inout5);
    1031 	&cmp	($len,2);
    1032 	&jb	(&label("ctr32_one"));
    1033 
    1034 	&pshufd	($inout2,$rndkey0,1<<6);
    1035 	&por	($inout1,$inout5);
    1036 	&je	(&label("ctr32_two"));
    1037 
    1038 	&pshufd	($inout3,$rndkey1,3<<6);
    1039 	&por	($inout2,$inout5);
    1040 	&cmp	($len,4);
    1041 	&jb	(&label("ctr32_three"));
    1042 
    1043 	&pshufd	($inout4,$rndkey1,2<<6);
    1044 	&por	($inout3,$inout5);
    1045 	&je	(&label("ctr32_four"));
    1046 
         	# five blocks: run the 6x routine and store only the first five
         	# results ($inout5's result is never written out)
    1047 	&por	($inout4,$inout5);
    1048 	&call	("_aesni_encrypt6");
    1049 	&movups	($rndkey1,&QWP(0,$inp));
    1050 	&movups	($rndkey0,&QWP(0x10,$inp));
    1051 	&xorps	($inout0,$rndkey1);
    1052 	&movups	($rndkey1,&QWP(0x20,$inp));
    1053 	&xorps	($inout1,$rndkey0);
    1054 	&movups	($rndkey0,&QWP(0x30,$inp));
    1055 	&xorps	($inout2,$rndkey1);
    1056 	&movups	($rndkey1,&QWP(0x40,$inp));
    1057 	&xorps	($inout3,$rndkey0);
    1058 	&movups	(&QWP(0,$out),$inout0);
    1059 	&xorps	($inout4,$rndkey1);
    1060 	&movups	(&QWP(0x10,$out),$inout1);
    1061 	&movups	(&QWP(0x20,$out),$inout2);
    1062 	&movups	(&QWP(0x30,$out),$inout3);
    1063 	&movups	(&QWP(0x40,$out),$inout4);
    1064 	&jmp	(&label("ctr32_ret"));
    1065 
    1066 &set_label("ctr32_one_shortcut",16);
         	# reached only from the len==1 test at the top: $rounds_ still holds
         	# the ivec pointer and ivec is used as-is as the single counter block
    1067 	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
    1068 	&mov	($rounds,&DWP(240,$key));
    1069 
    1070 &set_label("ctr32_one");
    1071 	if ($inline)
    1072 	{   &aesni_inline_generate1("enc");	}
    1073 	else
    1074 	{   &call	("_aesni_encrypt1");	}
    1075 	&movups	($in0,&QWP(0,$inp));
    1076 	&xorps	($in0,$inout0);
    1077 	&movups	(&QWP(0,$out),$in0);
    1078 	&jmp	(&label("ctr32_ret"));
    1079 
    1080 &set_label("ctr32_two",16);
         	# unused $inoutN registers double as input scratch in the 2..4 paths
    1081 	&call	("_aesni_encrypt2");
    1082 	&movups	($inout3,&QWP(0,$inp));
    1083 	&movups	($inout4,&QWP(0x10,$inp));
    1084 	&xorps	($inout0,$inout3);
    1085 	&xorps	($inout1,$inout4);
    1086 	&movups	(&QWP(0,$out),$inout0);
    1087 	&movups	(&QWP(0x10,$out),$inout1);
    1088 	&jmp	(&label("ctr32_ret"));
    1089 
    1090 &set_label("ctr32_three",16);
    1091 	&call	("_aesni_encrypt3");
    1092 	&movups	($inout3,&QWP(0,$inp));
    1093 	&movups	($inout4,&QWP(0x10,$inp));
    1094 	&xorps	($inout0,$inout3);
    1095 	&movups	($inout5,&QWP(0x20,$inp));
    1096 	&xorps	($inout1,$inout4);
    1097 	&movups	(&QWP(0,$out),$inout0);
    1098 	&xorps	($inout2,$inout5);
    1099 	&movups	(&QWP(0x10,$out),$inout1);
    1100 	&movups	(&QWP(0x20,$out),$inout2);
    1101 	&jmp	(&label("ctr32_ret"));
    1102 
    1103 &set_label("ctr32_four",16);
    1104 	&call	("_aesni_encrypt4");
    1105 	&movups	($inout4,&QWP(0,$inp));
    1106 	&movups	($inout5,&QWP(0x10,$inp));
    1107 	&movups	($rndkey1,&QWP(0x20,$inp));
    1108 	&xorps	($inout0,$inout4);
    1109 	&movups	($rndkey0,&QWP(0x30,$inp));
    1110 	&xorps	($inout1,$inout5);
    1111 	&movups	(&QWP(0,$out),$inout0);
    1112 	&xorps	($inout2,$rndkey1);
    1113 	&movups	(&QWP(0x10,$out),$inout1);
    1114 	&xorps	($inout3,$rndkey0);
    1115 	&movups	(&QWP(0x20,$out),$inout2);
    1116 	&movups	(&QWP(0x30,$out),$inout3);
         	# falls through into ctr32_ret
    1117 
    1118 &set_label("ctr32_ret");
         	# common exit: zero every xmm register and the stack slots that held
         	# the ivec and counter triplets (32, 48, 64), then restore %esp
    1119 	&pxor	("xmm0","xmm0");		# clear register bank
    1120 	&pxor	("xmm1","xmm1");
    1121 	&pxor	("xmm2","xmm2");
    1122 	&pxor	("xmm3","xmm3");
    1123 	&pxor	("xmm4","xmm4");
    1124 	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
    1125 	&pxor	("xmm5","xmm5");
    1126 	&movdqa	(&QWP(48,"esp"),"xmm0");
    1127 	&pxor	("xmm6","xmm6");
    1128 	&movdqa	(&QWP(64,"esp"),"xmm0");
    1129 	&pxor	("xmm7","xmm7");
    1130 	&mov	("esp",&DWP(80,"esp"));
    1131 &function_end("${PREFIX}_ctr32_encrypt_blocks");
   1132 
   1134 ######################################################################
   1135 # void aes_hw_xts_[en|de]crypt(const char *inp,char *out,size_t len,
    1136 #	const AES_KEY *key1, const AES_KEY *key2,
   1137 #	const unsigned char iv[16]);
   1138 #
   1139 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
   1140 
    1141 &function_begin("${PREFIX}_xts_encrypt");
         	# Emits the XTS encryption routine: the initial tweak is the iv
         	# encrypted under key2; data is then processed under key1, six
         	# blocks per iteration in the main loop, with dedicated paths for
         	# 1..5 remaining whole blocks and a byte-swapping "steal" step when
         	# len is not a multiple of 16.
         	#
         	# stack layout, relative to the aligned %esp set up below:
         	#	16*0..16*5	per-block tweaks
         	#	16*6		tweak-doubling constant, dwords 0x87,0,1,0
         	#	16*7+0		original len, later len%16
         	#	16*7+4		caller's %esp
    1142 	&mov	($key,&wparam(4));		# key2
    1143 	&mov	($inp,&wparam(5));		# clear-text tweak
    1144 
    1145 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
    1146 	&movups	($inout0,&QWP(0,$inp));
    1147 	if ($inline)
    1148 	{   &aesni_inline_generate1("enc");	}
    1149 	else
    1150 	{   &call	("_aesni_encrypt1");	}
         	# $inout0 now carries the encrypted tweak
    1151 
    1152 	&mov	($inp,&wparam(0));
    1153 	&mov	($out,&wparam(1));
    1154 	&mov	($len,&wparam(2));
    1155 	&mov	($key,&wparam(3));		# key1
    1156 
    1157 	&mov	($key_,"esp");
    1158 	&sub	("esp",16*7+8);
    1159 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
    1160 	&and	("esp",-16);			# align stack
    1161 
    1162 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
    1163 	&mov	(&DWP(16*6+4,"esp"),0);
    1164 	&mov	(&DWP(16*6+8,"esp"),1);
    1165 	&mov	(&DWP(16*6+12,"esp"),0);
    1166 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
    1167 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
    1168 
         	# Tweak doubling, as used throughout: pcmpgtd against zero turns the
         	# sign bit of every dword into an all-ones/all-zeros mask, pshufd
         	# 0x13 moves the masks of dwords 3 and 1 into dwords 0 and 2, and
         	# pand with 0x87,0,1,0 leaves the 0x87 residue and the carry into
         	# the upper half; paddq shifts both 64-bit halves left by one.
    1169 	&movdqa	($tweak,$inout0);
    1170 	&pxor	($twtmp,$twtmp);
    1171 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
    1172 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1173 
    1174 	&and	($len,-16);
    1175 	&mov	($key_,$key);			# backup $key
    1176 	&mov	($rounds_,$rounds);		# backup $rounds
         	# $len stays biased by -16*6 while in the main loop; a borrow here
         	# means there are fewer than six whole blocks
    1177 	&sub	($len,16*6);
    1178 	&jc	(&label("xts_enc_short"));
    1179 
         	# same "twisted" $rounds/$key arrangement the 6x loop body expects:
         	# $rounds_ = 16-16*rounds, $key = key1+32+16*rounds
    1180 	&shl	($rounds,4);
    1181 	&mov	($rounds_,16);
    1182 	&sub	($rounds_,$rounds);
    1183 	&lea	($key,&DWP(32,$key,$rounds));
    1184 	&jmp	(&label("xts_enc_loop6"));
    1185 
    1186 &set_label("xts_enc_loop6",16);
         	# this Perl loop runs at generation time and unrolls four tweak
         	# computations, saving tweaks 0..3 to the stack
    1187 	for ($i=0;$i<4;$i++) {
    1188 	    &pshufd	($twres,$twtmp,0x13);
    1189 	    &pxor	($twtmp,$twtmp);
    1190 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
    1191 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
    1192 	    &pand	($twres,$twmask);	# isolate carry and residue
    1193 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
    1194 	    &pxor	($tweak,$twres);
    1195 	}
         	# $i is 4 here; the post-increment stores tweak 4 and makes the
         	# later 16*$i refer to slot 5, where the sixth tweak is saved
    1196 	&pshufd	($inout5,$twtmp,0x13);
    1197 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
    1198 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1199 	 &$movekey	($rndkey0,&QWP(0,$key_));
    1200 	&pand	($inout5,$twmask);		# isolate carry and residue
    1201 	 &movups	($inout0,&QWP(0,$inp));	# load input
    1202 	&pxor	($inout5,$tweak);
    1203 
    1204 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
    1205 	&mov	($rounds,$rounds_);		# restore $rounds
    1206 	&movdqu	($inout1,&QWP(16*1,$inp));
    1207 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
    1208 	&movdqu	($inout2,&QWP(16*2,$inp));
    1209 	 &pxor		($inout1,$rndkey0);
    1210 	&movdqu	($inout3,&QWP(16*3,$inp));
    1211 	 &pxor		($inout2,$rndkey0);
    1212 	&movdqu	($inout4,&QWP(16*4,$inp));
    1213 	 &pxor		($inout3,$rndkey0);
    1214 	&movdqu	($rndkey1,&QWP(16*5,$inp));
    1215 	 &pxor		($inout4,$rndkey0);
    1216 	&lea	($inp,&DWP(16*6,$inp));
    1217 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
    1218 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
    1219 	&pxor	($inout5,$rndkey1);
    1220 
    1221 	 &$movekey	($rndkey1,&QWP(16,$key_));
    1222 	&pxor	($inout1,&QWP(16*1,"esp"));
    1223 	&pxor	($inout2,&QWP(16*2,"esp"));
    1224 	 &aesenc	($inout0,$rndkey1);
    1225 	&pxor	($inout3,&QWP(16*3,"esp"));
    1226 	&pxor	($inout4,&QWP(16*4,"esp"));
    1227 	 &aesenc	($inout1,$rndkey1);
    1228 	&pxor		($inout5,$rndkey0);
    1229 	 &$movekey	($rndkey0,&QWP(32,$key_));
    1230 	 &aesenc	($inout2,$rndkey1);
    1231 	 &aesenc	($inout3,$rndkey1);
    1232 	 &aesenc	($inout4,$rndkey1);
    1233 	 &aesenc	($inout5,$rndkey1);
    1234 	&call		(&label("_aesni_encrypt6_enter"));
    1235 
         	# xor the saved tweaks into the results and store them, interleaved
         	# with the start of the next tweak computation
    1236 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
    1237        &pxor	($twtmp,$twtmp);
    1238 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
    1239        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
    1240 	&xorps	($inout1,&QWP(16*1,"esp"));
    1241 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1242 	&xorps	($inout2,&QWP(16*2,"esp"));
    1243 	&movups	(&QWP(16*1,$out),$inout1);
    1244 	&xorps	($inout3,&QWP(16*3,"esp"));
    1245 	&movups	(&QWP(16*2,$out),$inout2);
    1246 	&xorps	($inout4,&QWP(16*4,"esp"));
    1247 	&movups	(&QWP(16*3,$out),$inout3);
    1248 	&xorps	($inout5,$tweak);
    1249 	&movups	(&QWP(16*4,$out),$inout4);
    1250        &pshufd	($twres,$twtmp,0x13);
    1251 	&movups	(&QWP(16*5,$out),$inout5);
    1252 	&lea	($out,&DWP(16*6,$out));
         	# $twmask aliases $inout1, which the encryption overwrote: reload it
    1253        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
    1254 
    1255 	&pxor	($twtmp,$twtmp);
    1256 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1257 	&pand	($twres,$twmask);		# isolate carry and residue
    1258 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1259 	&pxor	($tweak,$twres);
    1260 
    1261 	&sub	($len,16*6);
    1262 	&jnc	(&label("xts_enc_loop6"));
    1263 
    1264 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
    1265 	&mov	($key,$key_);			# restore $key
    1266 	&mov	($rounds_,$rounds);
    1267 
    1268 &set_label("xts_enc_short");
         	# undo the -16*6 bias; zero means no whole blocks are left
    1269 	&add	($len,16*6);
    1270 	&jz	(&label("xts_enc_done6x"));
    1271 
         	# 1..5 whole blocks remain. Tweaks are produced one at a time and
         	# parked in $inout3, $inout4, $inout5 as the count is narrowed down.
         	# The je branches reuse the flags of the preceding cmp; the SSE
         	# instructions in between do not modify EFLAGS.
    1272 	&movdqa	($inout3,$tweak);		# put aside previous tweak
    1273 	&cmp	($len,0x20);
    1274 	&jb	(&label("xts_enc_one"));
    1275 
    1276 	&pshufd	($twres,$twtmp,0x13);
    1277 	&pxor	($twtmp,$twtmp);
    1278 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1279 	&pand	($twres,$twmask);		# isolate carry and residue
    1280 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1281 	&pxor	($tweak,$twres);
    1282 	&je	(&label("xts_enc_two"));
    1283 
    1284 	&pshufd	($twres,$twtmp,0x13);
    1285 	&pxor	($twtmp,$twtmp);
    1286 	&movdqa	($inout4,$tweak);		# put aside previous tweak
    1287 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1288 	&pand	($twres,$twmask);		# isolate carry and residue
    1289 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1290 	&pxor	($tweak,$twres);
    1291 	&cmp	($len,0x40);
    1292 	&jb	(&label("xts_enc_three"));
    1293 
    1294 	&pshufd	($twres,$twtmp,0x13);
    1295 	&pxor	($twtmp,$twtmp);
    1296 	&movdqa	($inout5,$tweak);		# put aside previous tweak
    1297 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1298 	&pand	($twres,$twmask);		# isolate carry and residue
    1299 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1300 	&pxor	($tweak,$twres);
    1301 	&movdqa	(&QWP(16*0,"esp"),$inout3);
    1302 	&movdqa	(&QWP(16*1,"esp"),$inout4);
    1303 	&je	(&label("xts_enc_four"));
    1304 
         	# five blocks: tweaks 0..3 go to the stack, the fifth is built in
         	# $inout5 and saved at 16*4
    1305 	&movdqa	(&QWP(16*2,"esp"),$inout5);
    1306 	&pshufd	($inout5,$twtmp,0x13);
    1307 	&movdqa	(&QWP(16*3,"esp"),$tweak);
    1308 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1309 	&pand	($inout5,$twmask);		# isolate carry and residue
    1310 	&pxor	($inout5,$tweak);
    1311 
    1312 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
    1313 	&movdqu	($inout1,&QWP(16*1,$inp));
    1314 	&movdqu	($inout2,&QWP(16*2,$inp));
    1315 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
    1316 	&movdqu	($inout3,&QWP(16*3,$inp));
    1317 	&pxor	($inout1,&QWP(16*1,"esp"));
    1318 	&movdqu	($inout4,&QWP(16*4,$inp));
    1319 	&pxor	($inout2,&QWP(16*2,"esp"));
    1320 	&lea	($inp,&DWP(16*5,$inp));
    1321 	&pxor	($inout3,&QWP(16*3,"esp"));
    1322 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
    1323 	&pxor	($inout4,$inout5);
    1324 
         	# the 6x routine also processes $inout5, but that result is not
         	# written out below
    1325 	&call	("_aesni_encrypt6");
    1326 
    1327 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
    1328 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
    1329 	&xorps	($inout1,&QWP(16*1,"esp"));
    1330 	&xorps	($inout2,&QWP(16*2,"esp"));
    1331 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1332 	&xorps	($inout3,&QWP(16*3,"esp"));
    1333 	&movups	(&QWP(16*1,$out),$inout1);
    1334 	&xorps	($inout4,$tweak);
    1335 	&movups	(&QWP(16*2,$out),$inout2);
    1336 	&movups	(&QWP(16*3,$out),$inout3);
    1337 	&movups	(&QWP(16*4,$out),$inout4);
    1338 	&lea	($out,&DWP(16*5,$out));
    1339 	&jmp	(&label("xts_enc_done"));
    1340 
    1341 &set_label("xts_enc_one",16);
    1342 	&movups	($inout0,&QWP(16*0,$inp));	# load input
    1343 	&lea	($inp,&DWP(16*1,$inp));
    1344 	&xorps	($inout0,$inout3);		# input^=tweak
    1345 	if ($inline)
    1346 	{   &aesni_inline_generate1("enc");	}
    1347 	else
    1348 	{   &call	("_aesni_encrypt1");	}
    1349 	&xorps	($inout0,$inout3);		# output^=tweak
    1350 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1351 	&lea	($out,&DWP(16*1,$out));
    1352 
    1353 	&movdqa	($tweak,$inout3);		# last tweak
    1354 	&jmp	(&label("xts_enc_done"));
    1355 
    1356 &set_label("xts_enc_two",16);
    1357 	&movaps	($inout4,$tweak);		# put aside last tweak
    1358 
    1359 	&movups	($inout0,&QWP(16*0,$inp));	# load input
    1360 	&movups	($inout1,&QWP(16*1,$inp));
    1361 	&lea	($inp,&DWP(16*2,$inp));
    1362 	&xorps	($inout0,$inout3);		# input^=tweak
    1363 	&xorps	($inout1,$inout4);
    1364 
    1365 	&call	("_aesni_encrypt2");
    1366 
    1367 	&xorps	($inout0,$inout3);		# output^=tweak
    1368 	&xorps	($inout1,$inout4);
    1369 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1370 	&movups	(&QWP(16*1,$out),$inout1);
    1371 	&lea	($out,&DWP(16*2,$out));
    1372 
    1373 	&movdqa	($tweak,$inout4);		# last tweak
    1374 	&jmp	(&label("xts_enc_done"));
    1375 
    1376 &set_label("xts_enc_three",16);
    1377 	&movaps	($inout5,$tweak);		# put aside last tweak
    1378 	&movups	($inout0,&QWP(16*0,$inp));	# load input
    1379 	&movups	($inout1,&QWP(16*1,$inp));
    1380 	&movups	($inout2,&QWP(16*2,$inp));
    1381 	&lea	($inp,&DWP(16*3,$inp));
    1382 	&xorps	($inout0,$inout3);		# input^=tweak
    1383 	&xorps	($inout1,$inout4);
    1384 	&xorps	($inout2,$inout5);
    1385 
    1386 	&call	("_aesni_encrypt3");
    1387 
    1388 	&xorps	($inout0,$inout3);		# output^=tweak
    1389 	&xorps	($inout1,$inout4);
    1390 	&xorps	($inout2,$inout5);
    1391 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1392 	&movups	(&QWP(16*1,$out),$inout1);
    1393 	&movups	(&QWP(16*2,$out),$inout2);
    1394 	&lea	($out,&DWP(16*3,$out));
    1395 
    1396 	&movdqa	($tweak,$inout5);		# last tweak
    1397 	&jmp	(&label("xts_enc_done"));
    1398 
    1399 &set_label("xts_enc_four",16);
         	# tweaks 0 and 1 were stored to the stack before the branch here
         	# because $inout3 is needed as the fourth data lane
    1400 	&movaps	($inout4,$tweak);		# put aside last tweak
    1401 
    1402 	&movups	($inout0,&QWP(16*0,$inp));	# load input
    1403 	&movups	($inout1,&QWP(16*1,$inp));
    1404 	&movups	($inout2,&QWP(16*2,$inp));
    1405 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
    1406 	&movups	($inout3,&QWP(16*3,$inp));
    1407 	&lea	($inp,&DWP(16*4,$inp));
    1408 	&xorps	($inout1,&QWP(16*1,"esp"));
    1409 	&xorps	($inout2,$inout5);
    1410 	&xorps	($inout3,$inout4);
    1411 
    1412 	&call	("_aesni_encrypt4");
    1413 
    1414 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
    1415 	&xorps	($inout1,&QWP(16*1,"esp"));
    1416 	&xorps	($inout2,$inout5);
    1417 	&movups	(&QWP(16*0,$out),$inout0);	# write output
    1418 	&xorps	($inout3,$inout4);
    1419 	&movups	(&QWP(16*1,$out),$inout1);
    1420 	&movups	(&QWP(16*2,$out),$inout2);
    1421 	&movups	(&QWP(16*3,$out),$inout3);
    1422 	&lea	($out,&DWP(16*4,$out));
    1423 
    1424 	&movdqa	($tweak,$inout4);		# last tweak
    1425 	&jmp	(&label("xts_enc_done"));
    1426 
    1427 &set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
         	# reached when the main loop consumed every whole block: $tweak was
         	# already advanced at the bottom of the loop, so it is used directly
    1428 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
    1429 	&and	($len,15);
    1430 	&jz	(&label("xts_enc_ret"));
    1431 	&movdqa	($inout3,$tweak);
    1432 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
    1433 	&jmp	(&label("xts_enc_steal"));
    1434 
    1435 &set_label("xts_enc_done",16);
         	# reached from the 1..5 block paths with $tweak holding the tweak of
         	# the last block written; if a partial block follows, one more
         	# doubling yields the tweak for the stealing step in $inout3
    1436 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
    1437 	&pxor	($twtmp,$twtmp);
    1438 	&and	($len,15);
    1439 	&jz	(&label("xts_enc_ret"));
    1440 
    1441 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
    1442 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
    1443 	&pshufd	($inout3,$twtmp,0x13);
    1444 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
    1445 	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
    1446 	&pxor	($inout3,$tweak);
    1447 
    1448 &set_label("xts_enc_steal");
         	# byte loop, run len%16 times: the next input byte replaces a byte
         	# of the last full output block (at -16 from $out) and the byte it
         	# displaces is appended as the trailing partial output.
         	# $rounds and $key serve as byte scratch and are restored below.
    1449 	&movz	($rounds,&BP(0,$inp));
    1450 	&movz	($key,&BP(-16,$out));
    1451 	&lea	($inp,&DWP(1,$inp));
    1452 	&mov	(&BP(-16,$out),&LB($rounds));
    1453 	&mov	(&BP(0,$out),&LB($key));
    1454 	&lea	($out,&DWP(1,$out));
    1455 	&sub	($len,1);
    1456 	&jnz	(&label("xts_enc_steal"));
    1457 
    1458 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
    1459 	&mov	($key,$key_);			# restore $key
    1460 	&mov	($rounds,$rounds_);		# restore $rounds
    1461 
         	# re-encrypt the patched block in place with the tweak in $inout3
    1462 	&movups	($inout0,&QWP(-16,$out));	# load input
    1463 	&xorps	($inout0,$inout3);		# input^=tweak
    1464 	if ($inline)
    1465 	{   &aesni_inline_generate1("enc");	}
    1466 	else
    1467 	{   &call	("_aesni_encrypt1");	}
    1468 	&xorps	($inout0,$inout3);		# output^=tweak
    1469 	&movups	(&QWP(-16,$out),$inout0);	# write output
    1470 
    1471 &set_label("xts_enc_ret");
         	# common exit: zero every xmm register and the six tweak slots on
         	# the stack, then restore the caller's %esp
    1472 	&pxor	("xmm0","xmm0");		# clear register bank
    1473 	&pxor	("xmm1","xmm1");
    1474 	&pxor	("xmm2","xmm2");
    1475 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
    1476 	&pxor	("xmm3","xmm3");
    1477 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
    1478 	&pxor	("xmm4","xmm4");
    1479 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
    1480 	&pxor	("xmm5","xmm5");
    1481 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
    1482 	&pxor	("xmm6","xmm6");
    1483 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
    1484 	&pxor	("xmm7","xmm7");
    1485 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
    1486 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
    1487 &function_end("${PREFIX}_xts_encrypt");
   1488 
   1489 &function_begin("${PREFIX}_xts_decrypt");
   1490 	&mov	($key,&wparam(4));		# key2
   1491 	&mov	($inp,&wparam(5));		# clear-text tweak
   1492 
   1493 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
   1494 	&movups	($inout0,&QWP(0,$inp));
   1495 	if ($inline)
   1496 	{   &aesni_inline_generate1("enc");	}
   1497 	else
   1498 	{   &call	("_aesni_encrypt1");	}
   1499 
   1500 	&mov	($inp,&wparam(0));
   1501 	&mov	($out,&wparam(1));
   1502 	&mov	($len,&wparam(2));
   1503 	&mov	($key,&wparam(3));		# key1
   1504 
   1505 	&mov	($key_,"esp");
   1506 	&sub	("esp",16*7+8);
   1507 	&and	("esp",-16);			# align stack
   1508 
   1509 	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
   1510 	&test	($len,15);
   1511 	&setnz	(&LB($rounds_));
   1512 	&shl	($rounds_,4);
   1513 	&sub	($len,$rounds_);
   1514 
   1515 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
   1516 	&mov	(&DWP(16*6+4,"esp"),0);
   1517 	&mov	(&DWP(16*6+8,"esp"),1);
   1518 	&mov	(&DWP(16*6+12,"esp"),0);
   1519 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
   1520 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
   1521 
   1522 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
   1523 	&mov	($key_,$key);			# backup $key
   1524 	&mov	($rounds_,$rounds);		# backup $rounds
   1525 
   1526 	&movdqa	($tweak,$inout0);
   1527 	&pxor	($twtmp,$twtmp);
   1528 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
   1529 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1530 
   1531 	&and	($len,-16);
   1532 	&sub	($len,16*6);
   1533 	&jc	(&label("xts_dec_short"));
   1534 
   1535 	&shl	($rounds,4);
   1536 	&mov	($rounds_,16);
   1537 	&sub	($rounds_,$rounds);
   1538 	&lea	($key,&DWP(32,$key,$rounds));
   1539 	&jmp	(&label("xts_dec_loop6"));
   1540 
   1541 &set_label("xts_dec_loop6",16);
   1542 	for ($i=0;$i<4;$i++) {
   1543 	    &pshufd	($twres,$twtmp,0x13);
   1544 	    &pxor	($twtmp,$twtmp);
   1545 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
   1546 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
   1547 	    &pand	($twres,$twmask);	# isolate carry and residue
   1548 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
   1549 	    &pxor	($tweak,$twres);
   1550 	}
   1551 	&pshufd	($inout5,$twtmp,0x13);
   1552 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
   1553 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1554 	 &$movekey	($rndkey0,&QWP(0,$key_));
   1555 	&pand	($inout5,$twmask);		# isolate carry and residue
   1556 	 &movups	($inout0,&QWP(0,$inp));	# load input
   1557 	&pxor	($inout5,$tweak);
   1558 
   1559 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
   1560 	&mov	($rounds,$rounds_);
   1561 	&movdqu	($inout1,&QWP(16*1,$inp));
   1562 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
   1563 	&movdqu	($inout2,&QWP(16*2,$inp));
   1564 	 &pxor		($inout1,$rndkey0);
   1565 	&movdqu	($inout3,&QWP(16*3,$inp));
   1566 	 &pxor		($inout2,$rndkey0);
   1567 	&movdqu	($inout4,&QWP(16*4,$inp));
   1568 	 &pxor		($inout3,$rndkey0);
   1569 	&movdqu	($rndkey1,&QWP(16*5,$inp));
   1570 	 &pxor		($inout4,$rndkey0);
   1571 	&lea	($inp,&DWP(16*6,$inp));
   1572 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1573 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
   1574 	&pxor	($inout5,$rndkey1);
   1575 
   1576 	 &$movekey	($rndkey1,&QWP(16,$key_));
   1577 	&pxor	($inout1,&QWP(16*1,"esp"));
   1578 	&pxor	($inout2,&QWP(16*2,"esp"));
   1579 	 &aesdec	($inout0,$rndkey1);
   1580 	&pxor	($inout3,&QWP(16*3,"esp"));
   1581 	&pxor	($inout4,&QWP(16*4,"esp"));
   1582 	 &aesdec	($inout1,$rndkey1);
   1583 	&pxor		($inout5,$rndkey0);
   1584 	 &$movekey	($rndkey0,&QWP(32,$key_));
   1585 	 &aesdec	($inout2,$rndkey1);
   1586 	 &aesdec	($inout3,$rndkey1);
   1587 	 &aesdec	($inout4,$rndkey1);
   1588 	 &aesdec	($inout5,$rndkey1);
   1589 	&call		(&label("_aesni_decrypt6_enter"));
   1590 
   1591 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
   1592        &pxor	($twtmp,$twtmp);
   1593 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1594        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
   1595 	&xorps	($inout1,&QWP(16*1,"esp"));
   1596 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1597 	&xorps	($inout2,&QWP(16*2,"esp"));
   1598 	&movups	(&QWP(16*1,$out),$inout1);
   1599 	&xorps	($inout3,&QWP(16*3,"esp"));
   1600 	&movups	(&QWP(16*2,$out),$inout2);
   1601 	&xorps	($inout4,&QWP(16*4,"esp"));
   1602 	&movups	(&QWP(16*3,$out),$inout3);
   1603 	&xorps	($inout5,$tweak);
   1604 	&movups	(&QWP(16*4,$out),$inout4);
   1605        &pshufd	($twres,$twtmp,0x13);
   1606 	&movups	(&QWP(16*5,$out),$inout5);
   1607 	&lea	($out,&DWP(16*6,$out));
   1608        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
   1609 
   1610 	&pxor	($twtmp,$twtmp);
   1611 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1612 	&pand	($twres,$twmask);		# isolate carry and residue
   1613 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1614 	&pxor	($tweak,$twres);
   1615 
   1616 	&sub	($len,16*6);
   1617 	&jnc	(&label("xts_dec_loop6"));
   1618 
   1619 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1620 	&mov	($key,$key_);			# restore $key
   1621 	&mov	($rounds_,$rounds);
   1622 
   1623 &set_label("xts_dec_short");
   1624 	&add	($len,16*6);
   1625 	&jz	(&label("xts_dec_done6x"));
   1626 
   1627 	&movdqa	($inout3,$tweak);		# put aside previous tweak
   1628 	&cmp	($len,0x20);
   1629 	&jb	(&label("xts_dec_one"));
   1630 
   1631 	&pshufd	($twres,$twtmp,0x13);
   1632 	&pxor	($twtmp,$twtmp);
   1633 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1634 	&pand	($twres,$twmask);		# isolate carry and residue
   1635 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1636 	&pxor	($tweak,$twres);
   1637 	&je	(&label("xts_dec_two"));
   1638 
   1639 	&pshufd	($twres,$twtmp,0x13);
   1640 	&pxor	($twtmp,$twtmp);
   1641 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1642 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1643 	&pand	($twres,$twmask);		# isolate carry and residue
   1644 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1645 	&pxor	($tweak,$twres);
   1646 	&cmp	($len,0x40);
   1647 	&jb	(&label("xts_dec_three"));
   1648 
   1649 	&pshufd	($twres,$twtmp,0x13);
   1650 	&pxor	($twtmp,$twtmp);
   1651 	&movdqa	($inout5,$tweak);		# put aside previous tweak
   1652 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1653 	&pand	($twres,$twmask);		# isolate carry and residue
   1654 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1655 	&pxor	($tweak,$twres);
   1656 	&movdqa	(&QWP(16*0,"esp"),$inout3);
   1657 	&movdqa	(&QWP(16*1,"esp"),$inout4);
   1658 	&je	(&label("xts_dec_four"));
   1659 
   1660 	&movdqa	(&QWP(16*2,"esp"),$inout5);
   1661 	&pshufd	($inout5,$twtmp,0x13);
   1662 	&movdqa	(&QWP(16*3,"esp"),$tweak);
   1663 	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
   1664 	&pand	($inout5,$twmask);		# isolate carry and residue
   1665 	&pxor	($inout5,$tweak);
   1666 
   1667 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
   1668 	&movdqu	($inout1,&QWP(16*1,$inp));
   1669 	&movdqu	($inout2,&QWP(16*2,$inp));
   1670 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1671 	&movdqu	($inout3,&QWP(16*3,$inp));
   1672 	&pxor	($inout1,&QWP(16*1,"esp"));
   1673 	&movdqu	($inout4,&QWP(16*4,$inp));
   1674 	&pxor	($inout2,&QWP(16*2,"esp"));
   1675 	&lea	($inp,&DWP(16*5,$inp));
   1676 	&pxor	($inout3,&QWP(16*3,"esp"));
   1677 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
   1678 	&pxor	($inout4,$inout5);
   1679 
   1680 	&call	("_aesni_decrypt6");
   1681 
   1682 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
   1683 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1684 	&xorps	($inout1,&QWP(16*1,"esp"));
   1685 	&xorps	($inout2,&QWP(16*2,"esp"));
   1686 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1687 	&xorps	($inout3,&QWP(16*3,"esp"));
   1688 	&movups	(&QWP(16*1,$out),$inout1);
   1689 	&xorps	($inout4,$tweak);
   1690 	&movups	(&QWP(16*2,$out),$inout2);
   1691 	&movups	(&QWP(16*3,$out),$inout3);
   1692 	&movups	(&QWP(16*4,$out),$inout4);
   1693 	&lea	($out,&DWP(16*5,$out));
   1694 	&jmp	(&label("xts_dec_done"));
   1695 
   1696 &set_label("xts_dec_one",16);
   1697 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1698 	&lea	($inp,&DWP(16*1,$inp));
   1699 	&xorps	($inout0,$inout3);		# input^=tweak
   1700 	if ($inline)
   1701 	{   &aesni_inline_generate1("dec");	}
   1702 	else
   1703 	{   &call	("_aesni_decrypt1");	}
   1704 	&xorps	($inout0,$inout3);		# output^=tweak
   1705 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1706 	&lea	($out,&DWP(16*1,$out));
   1707 
   1708 	&movdqa	($tweak,$inout3);		# last tweak
   1709 	&jmp	(&label("xts_dec_done"));
   1710 
   1711 &set_label("xts_dec_two",16);
   1712 	&movaps	($inout4,$tweak);		# put aside last tweak
   1713 
   1714 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1715 	&movups	($inout1,&QWP(16*1,$inp));
   1716 	&lea	($inp,&DWP(16*2,$inp));
   1717 	&xorps	($inout0,$inout3);		# input^=tweak
   1718 	&xorps	($inout1,$inout4);
   1719 
   1720 	&call	("_aesni_decrypt2");
   1721 
   1722 	&xorps	($inout0,$inout3);		# output^=tweak
   1723 	&xorps	($inout1,$inout4);
   1724 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1725 	&movups	(&QWP(16*1,$out),$inout1);
   1726 	&lea	($out,&DWP(16*2,$out));
   1727 
   1728 	&movdqa	($tweak,$inout4);		# last tweak
   1729 	&jmp	(&label("xts_dec_done"));
   1730 
   1731 &set_label("xts_dec_three",16);
   1732 	&movaps	($inout5,$tweak);		# put aside last tweak
   1733 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1734 	&movups	($inout1,&QWP(16*1,$inp));
   1735 	&movups	($inout2,&QWP(16*2,$inp));
   1736 	&lea	($inp,&DWP(16*3,$inp));
   1737 	&xorps	($inout0,$inout3);		# input^=tweak
   1738 	&xorps	($inout1,$inout4);
   1739 	&xorps	($inout2,$inout5);
   1740 
   1741 	&call	("_aesni_decrypt3");
   1742 
   1743 	&xorps	($inout0,$inout3);		# output^=tweak
   1744 	&xorps	($inout1,$inout4);
   1745 	&xorps	($inout2,$inout5);
   1746 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1747 	&movups	(&QWP(16*1,$out),$inout1);
   1748 	&movups	(&QWP(16*2,$out),$inout2);
   1749 	&lea	($out,&DWP(16*3,$out));
   1750 
   1751 	&movdqa	($tweak,$inout5);		# last tweak
   1752 	&jmp	(&label("xts_dec_done"));
   1753 
   1754 &set_label("xts_dec_four",16);
   1755 	&movaps	($inout4,$tweak);		# put aside last tweak
   1756 
   1757 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1758 	&movups	($inout1,&QWP(16*1,$inp));
   1759 	&movups	($inout2,&QWP(16*2,$inp));
   1760 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1761 	&movups	($inout3,&QWP(16*3,$inp));
   1762 	&lea	($inp,&DWP(16*4,$inp));
   1763 	&xorps	($inout1,&QWP(16*1,"esp"));
   1764 	&xorps	($inout2,$inout5);
   1765 	&xorps	($inout3,$inout4);
   1766 
   1767 	&call	("_aesni_decrypt4");
   1768 
   1769 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1770 	&xorps	($inout1,&QWP(16*1,"esp"));
   1771 	&xorps	($inout2,$inout5);
   1772 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1773 	&xorps	($inout3,$inout4);
   1774 	&movups	(&QWP(16*1,$out),$inout1);
   1775 	&movups	(&QWP(16*2,$out),$inout2);
   1776 	&movups	(&QWP(16*3,$out),$inout3);
   1777 	&lea	($out,&DWP(16*4,$out));
   1778 
   1779 	&movdqa	($tweak,$inout4);		# last tweak
   1780 	&jmp	(&label("xts_dec_done"));
   1781 
   1782 &set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
   1783 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1784 	&and	($len,15);
   1785 	&jz	(&label("xts_dec_ret"));
   1786 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1787 	&jmp	(&label("xts_dec_only_one_more"));
   1788 
   1789 &set_label("xts_dec_done",16);
   1790 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1791 	&pxor	($twtmp,$twtmp);
   1792 	&and	($len,15);
   1793 	&jz	(&label("xts_dec_ret"));
   1794 
   1795 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1796 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1797 	&pshufd	($twres,$twtmp,0x13);
   1798 	&pxor	($twtmp,$twtmp);
   1799 	&movdqa	($twmask,&QWP(16*6,"esp"));
   1800 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1801 	&pand	($twres,$twmask);		# isolate carry and residue
   1802 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1803 	&pxor	($tweak,$twres);
   1804 
   1805 &set_label("xts_dec_only_one_more");
   1806 	&pshufd	($inout3,$twtmp,0x13);
   1807 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1808 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1809 	&pand	($inout3,$twmask);		# isolate carry and residue
   1810 	&pxor	($inout3,$tweak);
   1811 
   1812 	&mov	($key,$key_);			# restore $key
   1813 	&mov	($rounds,$rounds_);		# restore $rounds
   1814 
   1815 	&movups	($inout0,&QWP(0,$inp));		# load input
   1816 	&xorps	($inout0,$inout3);		# input^=tweak
   1817 	if ($inline)
   1818 	{   &aesni_inline_generate1("dec");	}
   1819 	else
   1820 	{   &call	("_aesni_decrypt1");	}
   1821 	&xorps	($inout0,$inout3);		# output^=tweak
   1822 	&movups	(&QWP(0,$out),$inout0);		# write output
   1823 
   1824 &set_label("xts_dec_steal");
   1825 	&movz	($rounds,&BP(16,$inp));
   1826 	&movz	($key,&BP(0,$out));
   1827 	&lea	($inp,&DWP(1,$inp));
   1828 	&mov	(&BP(0,$out),&LB($rounds));
   1829 	&mov	(&BP(16,$out),&LB($key));
   1830 	&lea	($out,&DWP(1,$out));
   1831 	&sub	($len,1);
   1832 	&jnz	(&label("xts_dec_steal"));
   1833 
   1834 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
   1835 	&mov	($key,$key_);			# restore $key
   1836 	&mov	($rounds,$rounds_);		# restore $rounds
   1837 
   1838 	&movups	($inout0,&QWP(0,$out));		# load input
   1839 	&xorps	($inout0,$inout4);		# input^=tweak
   1840 	if ($inline)
   1841 	{   &aesni_inline_generate1("dec");	}
   1842 	else
   1843 	{   &call	("_aesni_decrypt1");	}
   1844 	&xorps	($inout0,$inout4);		# output^=tweak
   1845 	&movups	(&QWP(0,$out),$inout0);		# write output
   1846 
   1847 &set_label("xts_dec_ret");
   1848 	&pxor	("xmm0","xmm0");		# clear register bank
   1849 	&pxor	("xmm1","xmm1");
   1850 	&pxor	("xmm2","xmm2");
   1851 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
   1852 	&pxor	("xmm3","xmm3");
   1853 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
   1854 	&pxor	("xmm4","xmm4");
   1855 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
   1856 	&pxor	("xmm5","xmm5");
   1857 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
   1858 	&pxor	("xmm6","xmm6");
   1859 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
   1860 	&pxor	("xmm7","xmm7");
   1861 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
   1862 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
   1863 &function_end("${PREFIX}_xts_decrypt");
   1864 }
   1865 }
   1866 
   1868 ######################################################################
   1869 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
   1870 #                           size_t length, const AES_KEY *key,
   1871 #                           unsigned char *ivp,const int enc);
        #
        # Emits the CBC bulk routine. enc!=0 encrypts one block at a time
        # (each block is chained to the previous ciphertext); enc==0 decrypts
        # six blocks per iteration through _aesni_decrypt6 and finishes with
        # a 1..5 block tail. A zero length returns immediately without
        # touching *ivp; otherwise the final chaining value is written back
        # to *ivp on exit.
        #
        # A 16-byte aligned scratch area is carved out below %esp:
        #	 0(%esp)	16-byte temporary (saved IV / last partial block)
        #	16(%esp)	caller's %esp, reloaded at cbc_ret
   1872 &function_begin("${PREFIX}_cbc_encrypt");
   1873 	&mov	($inp,&wparam(0));
   1874 	&mov	($rounds_,"esp");
   1875 	&mov	($out,&wparam(1));
   1876 	&sub	($rounds_,24);
   1877 	&mov	($len,&wparam(2));
   1878 	&and	($rounds_,-16);
   1879 	&mov	($key,&wparam(3));
   1880 	&mov	($key_,&wparam(4));
   1881 	&test	($len,$len);
   1882 	&jz	(&label("cbc_abort"));
   1883 
        	# The flags set by this compare are consumed by the &je below; the
        	# xchg/movups/mov instructions in between leave them untouched.
   1884 	&cmp	(&wparam(5),0);
   1885 	&xchg	($rounds_,"esp");		# alloca
   1886 	&movups	($ivec,&QWP(0,$key_));		# load IV
   1887 	&mov	($rounds,&DWP(240,$key));
   1888 	&mov	($key_,$key);			# backup $key
   1889 	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
   1890 	&mov	($rounds_,$rounds);		# backup $rounds
   1891 	&je	(&label("cbc_decrypt"));
   1892 
        	# Encrypt path: $inout0 carries the chaining value.
   1893 	&movaps	($inout0,$ivec);
   1894 	&cmp	($len,16);
   1895 	&jb	(&label("cbc_enc_tail"));
   1896 	&sub	($len,16);
   1897 	&jmp	(&label("cbc_enc_loop"));
   1898 
   1899 &set_label("cbc_enc_loop",16);
   1900 	&movups	($ivec,&QWP(0,$inp));		# input actually
   1901 	&lea	($inp,&DWP(16,$inp));
   1902 	if ($inline)
   1903 	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
   1904 	else
   1905 	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
   1906 	&mov	($rounds,$rounds_);	# restore $rounds
   1907 	&mov	($key,$key_);		# restore $key
   1908 	&movups	(&QWP(0,$out),$inout0);	# store output
   1909 	&lea	($out,&DWP(16,$out));
   1910 	&sub	($len,16);
   1911 	&jnc	(&label("cbc_enc_loop"));
   1912 	&add	($len,16);		# undo the borrow: $len = leftover bytes
   1913 	&jnz	(&label("cbc_enc_tail"));
   1914 	&movaps	($ivec,$inout0);	# last ciphertext block becomes the output IV
   1915 	&pxor	($inout0,$inout0);
   1916 	&jmp	(&label("cbc_ret"));
   1917 
        	# Partial final block: copy the $len leftover input bytes to $out,
        	# zero-pad them to a full block there, then run the loop once more
        	# with $inp pointing at that padded block.
   1918 &set_label("cbc_enc_tail");
   1919 	&mov	("ecx",$len);		# zaps $rounds
   1920 	&data_word(0xA4F3F689);		# rep movsb
   1921 	&mov	("ecx",16);		# zero tail
   1922 	&sub	("ecx",$len);
   1923 	&xor	("eax","eax");		# zaps $len
   1924 	&data_word(0xAAF3F689);		# rep stosb
   1925 	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
   1926 	&mov	($rounds,$rounds_);	# restore $rounds
   1927 	&mov	($inp,$out);		# $inp and $out are the same
   1928 	&mov	($key,$key_);		# restore $key
   1929 	&jmp	(&label("cbc_enc_loop"));
   1930 ######################################################################
        	# Decrypt path: inputs of at most 0x50 bytes go straight to the tail
        	# code; longer ones enter the six-block loop with $len biased by -0x50.
   1931 &set_label("cbc_decrypt",16);
   1932 	&cmp	($len,0x50);
   1933 	&jbe	(&label("cbc_dec_tail"));
   1934 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   1935 	&sub	($len,0x50);
   1936 	&jmp	(&label("cbc_dec_loop6_enter"));
   1937 
        	# The sixth output block of an iteration ($inout5) is not stored in
        	# the loop body; it is written here at the top of the next iteration,
        	# or by the code following the loop.
   1938 &set_label("cbc_dec_loop6",16);
   1939 	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
   1940 	&movups	(&QWP(0,$out),$inout5);
   1941 	&lea	($out,&DWP(0x10,$out));
   1942 &set_label("cbc_dec_loop6_enter");
   1943 	&movdqu	($inout0,&QWP(0,$inp));
   1944 	&movdqu	($inout1,&QWP(0x10,$inp));
   1945 	&movdqu	($inout2,&QWP(0x20,$inp));
   1946 	&movdqu	($inout3,&QWP(0x30,$inp));
   1947 	&movdqu	($inout4,&QWP(0x40,$inp));
   1948 	&movdqu	($inout5,&QWP(0x50,$inp));
   1949 
   1950 	&call	("_aesni_decrypt6");
   1951 
        	# The ciphertext is re-read from $inp to serve as the chaining
        	# values; the sixth ciphertext block ends up in $rndkey0 as next IV.
   1952 	&movups	($rndkey1,&QWP(0,$inp));
   1953 	&movups	($rndkey0,&QWP(0x10,$inp));
   1954 	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
   1955 	&xorps	($inout1,$rndkey1);
   1956 	&movups	($rndkey1,&QWP(0x20,$inp));
   1957 	&xorps	($inout2,$rndkey0);
   1958 	&movups	($rndkey0,&QWP(0x30,$inp));
   1959 	&xorps	($inout3,$rndkey1);
   1960 	&movups	($rndkey1,&QWP(0x40,$inp));
   1961 	&xorps	($inout4,$rndkey0);
   1962 	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
   1963 	&xorps	($inout5,$rndkey1);
   1964 	&movups	(&QWP(0,$out),$inout0);
   1965 	&movups	(&QWP(0x10,$out),$inout1);
   1966 	&lea	($inp,&DWP(0x60,$inp));
   1967 	&movups	(&QWP(0x20,$out),$inout2);
   1968 	&mov	($rounds,$rounds_);		# restore $rounds
   1969 	&movups	(&QWP(0x30,$out),$inout3);
   1970 	&mov	($key,$key_);			# restore $key
   1971 	&movups	(&QWP(0x40,$out),$inout4);
   1972 	&lea	($out,&DWP(0x50,$out));
   1973 	&sub	($len,0x60);
   1974 	&ja	(&label("cbc_dec_loop6"));
   1975 
        	# Loop finished: move the pending sixth block to $inout0 and remove
        	# the -0x50 bias. If nothing is left it is the final block,
        	# otherwise store it and continue with the tail.
   1976 	&movaps	($inout0,$inout5);
   1977 	&movaps	($ivec,$rndkey0);
   1978 	&add	($len,0x50);
   1979 	&jle	(&label("cbc_dec_clear_tail_collected"));
   1980 	&movups	(&QWP(0,$out),$inout0);
   1981 	&lea	($out,&DWP(0x10,$out));
        	# Tail dispatch on the remaining length; each handler leaves its last
        	# output block in $inout0 and the next IV in $ivec.
   1982 &set_label("cbc_dec_tail");
   1983 	&movups	($inout0,&QWP(0,$inp));
   1984 	&movaps	($in0,$inout0);
   1985 	&cmp	($len,0x10);
   1986 	&jbe	(&label("cbc_dec_one"));
   1987 
   1988 	&movups	($inout1,&QWP(0x10,$inp));
   1989 	&movaps	($in1,$inout1);
   1990 	&cmp	($len,0x20);
   1991 	&jbe	(&label("cbc_dec_two"));
   1992 
   1993 	&movups	($inout2,&QWP(0x20,$inp));
   1994 	&cmp	($len,0x30);
   1995 	&jbe	(&label("cbc_dec_three"));
   1996 
   1997 	&movups	($inout3,&QWP(0x30,$inp));
   1998 	&cmp	($len,0x40);
   1999 	&jbe	(&label("cbc_dec_four"));
   2000 
        	# Five blocks: reuse _aesni_decrypt6 with a zeroed sixth lane.
   2001 	&movups	($inout4,&QWP(0x40,$inp));
   2002 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   2003 	&movups	($inout0,&QWP(0,$inp));
   2004 	&xorps	($inout5,$inout5);
   2005 	&call	("_aesni_decrypt6");
   2006 	&movups	($rndkey1,&QWP(0,$inp));
   2007 	&movups	($rndkey0,&QWP(0x10,$inp));
   2008 	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
   2009 	&xorps	($inout1,$rndkey1);
   2010 	&movups	($rndkey1,&QWP(0x20,$inp));
   2011 	&xorps	($inout2,$rndkey0);
   2012 	&movups	($rndkey0,&QWP(0x30,$inp));
   2013 	&xorps	($inout3,$rndkey1);
   2014 	&movups	($ivec,&QWP(0x40,$inp));	# IV
   2015 	&xorps	($inout4,$rndkey0);
   2016 	&movups	(&QWP(0,$out),$inout0);
   2017 	&movups	(&QWP(0x10,$out),$inout1);
   2018 	&pxor	($inout1,$inout1);
   2019 	&movups	(&QWP(0x20,$out),$inout2);
   2020 	&pxor	($inout2,$inout2);
   2021 	&movups	(&QWP(0x30,$out),$inout3);
   2022 	&pxor	($inout3,$inout3);
   2023 	&lea	($out,&DWP(0x40,$out));
   2024 	&movaps	($inout0,$inout4);
   2025 	&pxor	($inout4,$inout4);
   2026 	&sub	($len,0x50);
   2027 	&jmp	(&label("cbc_dec_tail_collected"));
   2028 
   2029 &set_label("cbc_dec_one",16);
   2030 	if ($inline)
   2031 	{   &aesni_inline_generate1("dec");	}
   2032 	else
   2033 	{   &call	("_aesni_decrypt1");	}
   2034 	&xorps	($inout0,$ivec);
   2035 	&movaps	($ivec,$in0);
   2036 	&sub	($len,0x10);
   2037 	&jmp	(&label("cbc_dec_tail_collected"));
   2038 
   2039 &set_label("cbc_dec_two",16);
   2040 	&call	("_aesni_decrypt2");
   2041 	&xorps	($inout0,$ivec);
   2042 	&xorps	($inout1,$in0);
   2043 	&movups	(&QWP(0,$out),$inout0);
   2044 	&movaps	($inout0,$inout1);
   2045 	&pxor	($inout1,$inout1);
   2046 	&lea	($out,&DWP(0x10,$out));
   2047 	&movaps	($ivec,$in1);
   2048 	&sub	($len,0x20);
   2049 	&jmp	(&label("cbc_dec_tail_collected"));
   2050 
   2051 &set_label("cbc_dec_three",16);
   2052 	&call	("_aesni_decrypt3");
   2053 	&xorps	($inout0,$ivec);
   2054 	&xorps	($inout1,$in0);
   2055 	&xorps	($inout2,$in1);
   2056 	&movups	(&QWP(0,$out),$inout0);
   2057 	&movaps	($inout0,$inout2);
   2058 	&pxor	($inout2,$inout2);
   2059 	&movups	(&QWP(0x10,$out),$inout1);
   2060 	&pxor	($inout1,$inout1);
   2061 	&lea	($out,&DWP(0x20,$out));
   2062 	&movups	($ivec,&QWP(0x20,$inp));
   2063 	&sub	($len,0x30);
   2064 	&jmp	(&label("cbc_dec_tail_collected"));
   2065 
   2066 &set_label("cbc_dec_four",16);
   2067 	&call	("_aesni_decrypt4");
   2068 	&movups	($rndkey1,&QWP(0x10,$inp));
   2069 	&movups	($rndkey0,&QWP(0x20,$inp));
   2070 	&xorps	($inout0,$ivec);
   2071 	&movups	($ivec,&QWP(0x30,$inp));
   2072 	&xorps	($inout1,$in0);
   2073 	&movups	(&QWP(0,$out),$inout0);
   2074 	&xorps	($inout2,$rndkey1);
   2075 	&movups	(&QWP(0x10,$out),$inout1);
   2076 	&pxor	($inout1,$inout1);
   2077 	&xorps	($inout3,$rndkey0);
   2078 	&movups	(&QWP(0x20,$out),$inout2);
   2079 	&pxor	($inout2,$inout2);
   2080 	&lea	($out,&DWP(0x30,$out));
   2081 	&movaps	($inout0,$inout3);
   2082 	&pxor	($inout3,$inout3);
   2083 	&sub	($len,0x40);
   2084 	&jmp	(&label("cbc_dec_tail_collected"));
   2085 
   2086 &set_label("cbc_dec_clear_tail_collected",16);
   2087 	&pxor	($inout1,$inout1);
   2088 	&pxor	($inout2,$inout2);
   2089 	&pxor	($inout3,$inout3);
   2090 	&pxor	($inout4,$inout4);
        	# $inout0 holds the final output block. A non-zero $len&15 means the
        	# input length was not a multiple of 16 and the block is stored only
        	# partially.
   2091 &set_label("cbc_dec_tail_collected");
   2092 	&and	($len,15);
   2093 	&jnz	(&label("cbc_dec_tail_partial"));
   2094 	&movups	(&QWP(0,$out),$inout0);
   2095 	&pxor	($rndkey0,$rndkey0);
   2096 	&jmp	(&label("cbc_ret"));
   2097 
        	# Spill the block to the scratch slot and byte-copy 16-$len bytes of
        	# it to $out with rep movsb.
   2098 &set_label("cbc_dec_tail_partial",16);
   2099 	&movaps	(&QWP(0,"esp"),$inout0);
   2100 	&pxor	($rndkey0,$rndkey0);
   2101 	&mov	("ecx",16);
   2102 	&mov	($inp,"esp");
   2103 	&sub	("ecx",$len);
   2104 	&data_word(0xA4F3F689);		# rep movsb
        	# Wipe the plaintext copy from the scratch slot. $rndkey0 was zeroed
        	# above; storing $inout0 here would merely rewrite the same plaintext.
   2105 	&movdqa	(&QWP(0,"esp"),$rndkey0);
   2106 
   2107 &set_label("cbc_ret");
   2108 	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
   2109 	&mov	($key_,&wparam(4));
   2110 	&pxor	($inout0,$inout0);
   2111 	&pxor	($rndkey1,$rndkey1);
   2112 	&movups	(&QWP(0,$key_),$ivec);	# output IV
   2113 	&pxor	($ivec,$ivec);
   2114 &set_label("cbc_abort");
   2115 &function_end("${PREFIX}_cbc_encrypt");
   2116 
   2118 ######################################################################
   2119 # Mechanical port from aesni-x86_64.pl.
   2120 #
   2121 # _aesni_set_encrypt_key is private interface,
   2122 # input:
   2123 #	"eax"	const unsigned char *userKey
   2124 #	$rounds	int bits
   2125 #	$key	AES_KEY *key
   2126 # output:
   2127 #	"eax"	return code: 0 on success, -1 if either pointer is NULL,
        #		-2 if bits is not 128, 192 or 256
   2128 #	$rounds	9, 11 or 13 on success; the same value is stored at byte
        #		offset 240 of *key
        #
        # The expanded schedule is written to *key starting at offset 0.
        # Each key size has two code paths: the default one built on
        # aeskeygenassist, and an "_alt" one built on pshufb+aesenclast with
        # constants from key_const. The "_alt" path is taken when the second
        # dword of OPENSSL_ia32cap_P masked with (1<<28|1<<11) equals 1<<28.
        # %ebp and %ebx are saved and restored; xmm0-xmm5 are cleared on the
        # success path.
   2129 
   2130 &function_begin_B("_aesni_set_encrypt_key");
   2131 	&push	("ebp");
   2132 	&push	("ebx");
   2133 	&test	("eax","eax");
   2134 	&jz	(&label("bad_pointer"));
   2135 	&test	($key,$key);
   2136 	&jz	(&label("bad_pointer"));
   2137 
        	# call/pop yields the address of "pic"; the lea turns it into the
        	# address of key_const, kept in %ebx for the _alt paths.
   2138 	&call	(&label("pic"));
   2139 &set_label("pic");
   2140 	&blindpop("ebx");
   2141 	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
   2142 
   2143 	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
   2144 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
   2145 	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
   2146 	&mov	("ebp",&DWP(4,"ebp"));
   2147 	&lea	($key,&DWP(16,$key));	# $key runs one slot ahead; round 0 goes to -16($key)
   2148 	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
   2149 	&cmp	($rounds,256);
   2150 	&je	(&label("14rounds"));
   2151 	&cmp	($rounds,192);
   2152 	&je	(&label("12rounds"));
   2153 	&cmp	($rounds,128);
   2154 	&jne	(&label("bad_keybits"));
   2155 
   2156 &set_label("10rounds",16);
   2157 	&cmp		("ebp",1<<28);
   2158 	&je		(&label("10rounds_alt"));
   2159 
   2160 	&mov		($rounds,9);
   2161 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2162 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
   2163 	&call		(&label("key_128_cold"));
   2164 	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
   2165 	&call		(&label("key_128"));
   2166 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
   2167 	&call		(&label("key_128"));
   2168 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
   2169 	&call		(&label("key_128"));
   2170 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
   2171 	&call		(&label("key_128"));
   2172 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
   2173 	&call		(&label("key_128"));
   2174 	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
   2175 	&call		(&label("key_128"));
   2176 	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
   2177 	&call		(&label("key_128"));
   2178 	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
   2179 	&call		(&label("key_128"));
   2180 	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
   2181 	&call		(&label("key_128"));
   2182 	&$movekey	(&QWP(0,$key),"xmm0");		# store round 10
   2183 	&mov		(&DWP(80,$key),$rounds);	# $key is at +160 here, so this is offset 240
   2184 
   2185 	&jmp	(&label("good_key"));
   2186 
        	# key_128: store the round key in xmm0, advance $key, then derive the
        	# next round key from xmm0 and the aeskeygenassist result in xmm1.
        	# key_128_cold skips the store (round 0 was stored by the caller).
   2187 &set_label("key_128",16);
   2188 	&$movekey	(&QWP(0,$key),"xmm0");
   2189 	&lea		($key,&DWP(16,$key));
   2190 &set_label("key_128_cold");
   2191 	&shufps		("xmm4","xmm0",0b00010000);
   2192 	&xorps		("xmm0","xmm4");
   2193 	&shufps		("xmm4","xmm0",0b10001100);
   2194 	&xorps		("xmm0","xmm4");
   2195 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2196 	&xorps		("xmm0","xmm1");
   2197 	&ret();
   2198 
        	# 128-bit _alt path: xmm5 = pshufb mask, xmm4 = round constant
        	# (doubled by pslld after each use), xmm2 = previous round key.
        	# The loop produces rounds 1-8; rounds 9 and 10 follow unrolled with
        	# the constant reloaded from key_const+0x30.
   2199 &set_label("10rounds_alt",16);
   2200 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2201 	&mov		($rounds,8);
   2202 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2203 	&movdqa		("xmm2","xmm0");
   2204 	&movdqu		(&QWP(-16,$key),"xmm0");
   2205 
   2206 &set_label("loop_key128");
   2207 	&pshufb		("xmm0","xmm5");
   2208 	&aesenclast	("xmm0","xmm4");
   2209 	&pslld		("xmm4",1);
   2210 	&lea		($key,&DWP(16,$key));
   2211 
   2212 	&movdqa		("xmm3","xmm2");
   2213 	&pslldq		("xmm2",4);
   2214 	&pxor		("xmm3","xmm2");
   2215 	&pslldq		("xmm2",4);
   2216 	&pxor		("xmm3","xmm2");
   2217 	&pslldq		("xmm2",4);
   2218 	&pxor		("xmm2","xmm3");
   2219 
   2220 	&pxor		("xmm0","xmm2");
   2221 	&movdqu		(&QWP(-16,$key),"xmm0");
   2222 	&movdqa		("xmm2","xmm0");
   2223 
   2224 	&dec		($rounds);
   2225 	&jnz		(&label("loop_key128"));
   2226 
   2227 	&movdqa		("xmm4",&QWP(0x30,"ebx"));
   2228 
   2229 	&pshufb		("xmm0","xmm5");
   2230 	&aesenclast	("xmm0","xmm4");
   2231 	&pslld		("xmm4",1);
   2232 
   2233 	&movdqa		("xmm3","xmm2");
   2234 	&pslldq		("xmm2",4);
   2235 	&pxor		("xmm3","xmm2");
   2236 	&pslldq		("xmm2",4);
   2237 	&pxor		("xmm3","xmm2");
   2238 	&pslldq		("xmm2",4);
   2239 	&pxor		("xmm2","xmm3");
   2240 
   2241 	&pxor		("xmm0","xmm2");
   2242 	&movdqu		(&QWP(0,$key),"xmm0");
   2243 
   2244 	&movdqa		("xmm2","xmm0");
   2245 	&pshufb		("xmm0","xmm5");
   2246 	&aesenclast	("xmm0","xmm4");
   2247 
   2248 	&movdqa		("xmm3","xmm2");
   2249 	&pslldq		("xmm2",4);
   2250 	&pxor		("xmm3","xmm2");
   2251 	&pslldq		("xmm2",4);
   2252 	&pxor		("xmm3","xmm2");
   2253 	&pslldq		("xmm2",4);
   2254 	&pxor		("xmm2","xmm3");
   2255 
   2256 	&pxor		("xmm0","xmm2");
   2257 	&movdqu		(&QWP(16,$key),"xmm0");
   2258 
   2259 	&mov		($rounds,9);
   2260 	&mov		(&DWP(96,$key),$rounds);	# $key is at +144 here, so this is offset 240
   2261 
   2262 	&jmp	(&label("good_key"));
   2263 
   2264 &set_label("12rounds",16);
   2265 	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
   2266 	&cmp		("ebp",1<<28);
   2267 	&je		(&label("12rounds_alt"));
   2268 
   2269 	&mov		($rounds,11);
   2270 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2271 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
   2272 	&call		(&label("key_192a_cold"));
   2273 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
   2274 	&call		(&label("key_192b"));
   2275 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
   2276 	&call		(&label("key_192a"));
   2277 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
   2278 	&call		(&label("key_192b"));
   2279 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
   2280 	&call		(&label("key_192a"));
   2281 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
   2282 	&call		(&label("key_192b"));
   2283 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
   2284 	&call		(&label("key_192a"));
   2285 	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
   2286 	&call		(&label("key_192b"));
   2287 	&$movekey	(&QWP(0,$key),"xmm0");
   2288 	&mov		(&DWP(48,$key),$rounds);	# $key is at +192 here, so this is offset 240
   2289 
   2290 	&jmp	(&label("good_key"));
   2291 
        	# key_192a stores one 16-byte slot (xmm0) and saves xmm2 in xmm5;
        	# key_192b stores two slots assembled from xmm5/xmm0/xmm2. Both then
        	# run the shared update of xmm0 and xmm2 at key_192b_warm.
   2292 &set_label("key_192a",16);
   2293 	&$movekey	(&QWP(0,$key),"xmm0");
   2294 	&lea		($key,&DWP(16,$key));
   2295 &set_label("key_192a_cold",16);
   2296 	&movaps		("xmm5","xmm2");
   2297 &set_label("key_192b_warm");
   2298 	&shufps		("xmm4","xmm0",0b00010000);
   2299 	&movdqa		("xmm3","xmm2");
   2300 	&xorps		("xmm0","xmm4");
   2301 	&shufps		("xmm4","xmm0",0b10001100);
   2302 	&pslldq		("xmm3",4);
   2303 	&xorps		("xmm0","xmm4");
   2304 	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
   2305 	&pxor		("xmm2","xmm3");
   2306 	&pxor		("xmm0","xmm1");
   2307 	&pshufd		("xmm3","xmm0",0b11111111);
   2308 	&pxor		("xmm2","xmm3");
   2309 	&ret();
   2310 
   2311 &set_label("key_192b",16);
   2312 	&movaps		("xmm3","xmm0");
   2313 	&shufps		("xmm5","xmm0",0b01000100);
   2314 	&$movekey	(&QWP(0,$key),"xmm5");
   2315 	&shufps		("xmm3","xmm2",0b01001110);
   2316 	&$movekey	(&QWP(16,$key),"xmm3");
   2317 	&lea		($key,&DWP(32,$key));
   2318 	&jmp		(&label("key_192b_warm"));
   2319 
        	# 192-bit _alt path: each of the 8 iterations writes 24 bytes of
        	# schedule (8 via movq, 16 via movdqu) and advances $key by 24.
   2320 &set_label("12rounds_alt",16);
   2321 	&movdqa		("xmm5",&QWP(0x10,"ebx"));
   2322 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2323 	&mov		($rounds,8);
   2324 	&movdqu		(&QWP(-16,$key),"xmm0");
   2325 
   2326 &set_label("loop_key192");
   2327 	&movq		(&QWP(0,$key),"xmm2");
   2328 	&movdqa		("xmm1","xmm2");
   2329 	&pshufb		("xmm2","xmm5");
   2330 	&aesenclast	("xmm2","xmm4");
   2331 	&pslld		("xmm4",1);
   2332 	&lea		($key,&DWP(24,$key));
   2333 
   2334 	&movdqa		("xmm3","xmm0");
   2335 	&pslldq		("xmm0",4);
   2336 	&pxor		("xmm3","xmm0");
   2337 	&pslldq		("xmm0",4);
   2338 	&pxor		("xmm3","xmm0");
   2339 	&pslldq		("xmm0",4);
   2340 	&pxor		("xmm0","xmm3");
   2341 
   2342 	&pshufd		("xmm3","xmm0",0xff);
   2343 	&pxor		("xmm3","xmm1");
   2344 	&pslldq		("xmm1",4);
   2345 	&pxor		("xmm3","xmm1");
   2346 
   2347 	&pxor		("xmm0","xmm2");
   2348 	&pxor		("xmm2","xmm3");
   2349 	&movdqu		(&QWP(-16,$key),"xmm0");
   2350 
   2351 	&dec		($rounds);
   2352 	&jnz		(&label("loop_key192"));
   2353 
   2354 	&mov	($rounds,11);
   2355 	&mov	(&DWP(32,$key),$rounds);	# $key is at +208 here, so this is offset 240
   2356 
   2357 	&jmp	(&label("good_key"));
   2358 
   2359 &set_label("14rounds",16);
   2360 	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
   2361 	&lea		($key,&DWP(16,$key));		# $key now two slots ahead of the schedule base
   2362 	&cmp		("ebp",1<<28);
   2363 	&je		(&label("14rounds_alt"));
   2364 
   2365 	&mov		($rounds,13);
   2366 	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
   2367 	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
   2368 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
   2369 	&call		(&label("key_256a_cold"));
   2370 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
   2371 	&call		(&label("key_256b"));
   2372 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
   2373 	&call		(&label("key_256a"));
   2374 	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
   2375 	&call		(&label("key_256b"));
   2376 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
   2377 	&call		(&label("key_256a"));
   2378 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
   2379 	&call		(&label("key_256b"));
   2380 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
   2381 	&call		(&label("key_256a"));
   2382 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
   2383 	&call		(&label("key_256b"));
   2384 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
   2385 	&call		(&label("key_256a"));
   2386 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
   2387 	&call		(&label("key_256b"));
   2388 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
   2389 	&call		(&label("key_256a"));
   2390 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
   2391 	&call		(&label("key_256b"));
   2392 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
   2393 	&call		(&label("key_256a"));
   2394 	&$movekey	(&QWP(0,$key),"xmm0");
   2395 	&mov		(&DWP(16,$key),$rounds);	# $key is at +224 here, so this is offset 240
   2396 	&xor		("eax","eax");
   2397 
   2398 	&jmp	(&label("good_key"));
   2399 
        	# key_256a stores xmm2 and updates xmm0; key_256b stores xmm0 and
        	# updates xmm2. The two alternate so each call stores the key made
        	# by the previous one.
   2400 &set_label("key_256a",16);
   2401 	&$movekey	(&QWP(0,$key),"xmm2");
   2402 	&lea		($key,&DWP(16,$key));
   2403 &set_label("key_256a_cold");
   2404 	&shufps		("xmm4","xmm0",0b00010000);
   2405 	&xorps		("xmm0","xmm4");
   2406 	&shufps		("xmm4","xmm0",0b10001100);
   2407 	&xorps		("xmm0","xmm4");
   2408 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2409 	&xorps		("xmm0","xmm1");
   2410 	&ret();
   2411 
   2412 &set_label("key_256b",16);
   2413 	&$movekey	(&QWP(0,$key),"xmm0");
   2414 	&lea		($key,&DWP(16,$key));
   2415 
   2416 	&shufps		("xmm4","xmm2",0b00010000);
   2417 	&xorps		("xmm2","xmm4");
   2418 	&shufps		("xmm4","xmm2",0b10001100);
   2419 	&xorps		("xmm2","xmm4");
   2420 	&shufps		("xmm1","xmm1",0b10101010);	# critical path
   2421 	&xorps		("xmm2","xmm1");
   2422 	&ret();
   2423 
        	# 256-bit _alt path: the counter is 7 and each pass stores one round
        	# key at 0($key). All passes but the last store a second one at
        	# 16($key) and advance $key by 32; the last pass exits through
        	# done_key256 straight after its first store.
   2424 &set_label("14rounds_alt",16);
   2425 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2426 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2427 	&mov		($rounds,7);
   2428 	&movdqu		(&QWP(-32,$key),"xmm0");
   2429 	&movdqa		("xmm1","xmm2");
   2430 	&movdqu		(&QWP(-16,$key),"xmm2");
   2431 
   2432 &set_label("loop_key256");
   2433 	&pshufb		("xmm2","xmm5");
   2434 	&aesenclast	("xmm2","xmm4");
   2435 
   2436 	&movdqa		("xmm3","xmm0");
   2437 	&pslldq		("xmm0",4);
   2438 	&pxor		("xmm3","xmm0");
   2439 	&pslldq		("xmm0",4);
   2440 	&pxor		("xmm3","xmm0");
   2441 	&pslldq		("xmm0",4);
   2442 	&pxor		("xmm0","xmm3");
   2443 	&pslld		("xmm4",1);
   2444 
   2445 	&pxor		("xmm0","xmm2");
   2446 	&movdqu		(&QWP(0,$key),"xmm0");
   2447 
   2448 	&dec		($rounds);
   2449 	&jz		(&label("done_key256"));
   2450 
   2451 	&pshufd		("xmm2","xmm0",0xff);
   2452 	&pxor		("xmm3","xmm3");
   2453 	&aesenclast	("xmm2","xmm3");
   2454 
   2455 	&movdqa		("xmm3","xmm1");
   2456 	&pslldq		("xmm1",4);
   2457 	&pxor		("xmm3","xmm1");
   2458 	&pslldq		("xmm1",4);
   2459 	&pxor		("xmm3","xmm1");
   2460 	&pslldq		("xmm1",4);
   2461 	&pxor		("xmm1","xmm3");
   2462 
   2463 	&pxor		("xmm2","xmm1");
   2464 	&movdqu		(&QWP(16,$key),"xmm2");
   2465 	&lea		($key,&DWP(32,$key));
   2466 	&movdqa		("xmm1","xmm2");
   2467 	&jmp		(&label("loop_key256"));
   2468 
   2469 &set_label("done_key256");
   2470 	&mov		($rounds,13);
   2471 	&mov		(&DWP(16,$key),$rounds);	# $key is at +224 here, so this is offset 240
   2472 
        	# Common success exit: wipe the xmm registers that held key material
        	# and return 0.
   2473 &set_label("good_key");
   2474 	&pxor	("xmm0","xmm0");
   2475 	&pxor	("xmm1","xmm1");
   2476 	&pxor	("xmm2","xmm2");
   2477 	&pxor	("xmm3","xmm3");
   2478 	&pxor	("xmm4","xmm4");
   2479 	&pxor	("xmm5","xmm5");
   2480 	&xor	("eax","eax");
   2481 	&pop	("ebx");
   2482 	&pop	("ebp");
   2483 	&ret	();
   2484 
   2485 &set_label("bad_pointer",4);
   2486 	&mov	("eax",-1);
   2487 	&pop	("ebx");
   2488 	&pop	("ebp");
   2489 	&ret	();
   2490 &set_label("bad_keybits",4);
   2491 	&pxor	("xmm0","xmm0");	# xmm0 already holds the first 16 key bytes
   2492 	&mov	("eax",-2);
   2493 	&pop	("ebx");
   2494 	&pop	("ebp");
   2495 	&ret	();
   2496 &function_end_B("_aesni_set_encrypt_key");
   2497 
   2498 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
   2499 #                              AES_KEY *key)
        #
        # Public entry point: loads the three stack arguments into the
        # registers _aesni_set_encrypt_key expects ("eax", $rounds, $key) and
        # returns that routine's "eax" result unchanged.
   2500 &function_begin_B("${PREFIX}_set_encrypt_key");
        	# NOTE(review): record_function_hit is defined outside this chunk;
        	# presumably test/coverage instrumentation - confirm.
   2501 	&record_function_hit(3);
   2502 
   2503 	&mov	("eax",&wparam(0));	# userKey
   2504 	&mov	($rounds,&wparam(1));	# bits
   2505 	&mov	($key,&wparam(2));	# key
   2506 	&call	("_aesni_set_encrypt_key");
   2507 	&ret	();
   2508 &function_end_B("${PREFIX}_set_encrypt_key");
   2509 
   2510 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
   2511 #                              AES_KEY *key)
        #
        # Builds the encryption schedule with _aesni_set_encrypt_key and, if
        # that succeeded, converts it in place into a decryption schedule:
        # the round keys are reversed in order and every key except the first
        # and last is passed through aesimc. A non-zero return code from
        # _aesni_set_encrypt_key is returned as is.
   2512 &function_begin_B("${PREFIX}_set_decrypt_key");
   2513 	&mov	("eax",&wparam(0));
   2514 	&mov	($rounds,&wparam(1));
   2515 	&mov	($key,&wparam(2));
   2516 	&call	("_aesni_set_encrypt_key");
   2517 	&mov	($key,&wparam(2));	# reload: the call advanced $key
   2518 	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
   2519 	&test	("eax","eax");
   2520 	&jnz	(&label("dec_key_ret"));
   2521 	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
   2522 
   2523 	&$movekey	("xmm0",&QWP(0,$key));	# just swap
   2524 	&$movekey	("xmm1",&QWP(0,"eax"));
   2525 	&$movekey	(&QWP(0,"eax"),"xmm0");
   2526 	&$movekey	(&QWP(0,$key),"xmm1");
   2527 	&lea		($key,&DWP(16,$key));
   2528 	&lea		("eax",&DWP(-16,"eax"));
   2529 
        	# Walk inwards from both ends until the two pointers meet.
   2530 &set_label("dec_key_inverse");
   2531 	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
   2532 	&$movekey	("xmm1",&QWP(0,"eax"));
   2533 	&aesimc		("xmm0","xmm0");
   2534 	&aesimc		("xmm1","xmm1");
   2535 	&lea		($key,&DWP(16,$key));
   2536 	&lea		("eax",&DWP(-16,"eax"));
   2537 	&$movekey	(&QWP(16,"eax"),"xmm0");	# offsets compensate for the lea above
   2538 	&$movekey	(&QWP(-16,$key),"xmm1");
   2539 	&cmp		("eax",$key);
   2540 	&ja		(&label("dec_key_inverse"));
   2541 
   2542 	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
   2543 	&aesimc		("xmm0","xmm0");
   2544 	&$movekey	(&QWP(0,$key),"xmm0");
   2545 
   2546 	&pxor		("xmm0","xmm0");	# wipe key material from registers
   2547 	&pxor		("xmm1","xmm1");
   2548 	&xor		("eax","eax");		# return success
   2549 &set_label("dec_key_ret");
   2550 	&ret	();
   2551 &function_end_B("${PREFIX}_set_decrypt_key");
   2552 
   2553 &set_label("key_const",64);
        # Constant pool read through %ebx by the *_alt key-schedule paths of
        # _aesni_set_encrypt_key; offsets are relative to key_const.
   2554 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);	# +0x00: pshufb mask, 128/256-bit paths
   2555 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);	# +0x10: pshufb mask, 192-bit path
   2556 &data_word(1,1,1,1);						# +0x20: initial round constant
   2557 &data_word(0x1b,0x1b,0x1b,0x1b);				# +0x30: round constant reloaded for the last 128-bit rounds
   2558 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
   2559 
   2560 &asm_finish();
   2561 
        # Buffered output is flushed at close; fail loudly rather than leave a
        # silently truncated assembly file behind (e.g. on a full disk).
   2562 close STDOUT or die "error closing STDOUT: $!";
   2563