#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
     31 
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
     40 
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
     45 
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91
# Haswell	4.44/0.80	0.97	1.03	0.72
# Silvermont	5.77/3.56	3.67	4.03	3.46
# Bulldozer	5.80/0.98	1.05	1.24	0.93
     56 
     57 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
     58 			# generates drop-in replacement for
     59 			# crypto/aes/asm/aes-586.pl:-)
     60 $inline=1;		# inline _aesni_[en|de]crypt
     61 
     62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     63 push(@INC,"${dir}","${dir}../../perlasm");
     64 require "x86asm.pl";
     65 
     66 &asm_init($ARGV[0],$0);
     67 
     68 &external_label("OPENSSL_ia32cap_P");
     69 &static_label("key_const");
     70 
     71 if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
     72 else			{ $movekey=\&movups; }
     73 
     74 $len="eax";
     75 $rounds="ecx";
     76 $key="edx";
     77 $inp="esi";
     78 $out="edi";
     79 $rounds_="ebx";	# backup copy for $rounds
     80 $key_="ebp";	# backup copy for $key
     81 
     82 $rndkey0="xmm0";
     83 $rndkey1="xmm1";
     84 $inout0="xmm2";
     85 $inout1="xmm3";
     86 $inout2="xmm4";
     87 $inout3="xmm5";	$in1="xmm5";
     88 $inout4="xmm6";	$in0="xmm6";
     89 $inout5="xmm7";	$ivec="xmm7";
     90 
     91 # AESNI extension
     92 sub aeskeygenassist
     93 { my($dst,$src,$imm)=@_;
     94     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
     95     {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
     96 }
     97 sub aescommon
     98 { my($opcodelet,$dst,$src)=@_;
     99     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    100     {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
    101 }
    102 sub aesimc	{ aescommon(0xdb,@_); }
    103 sub aesenc	{ aescommon(0xdc,@_); }
    104 sub aesenclast	{ aescommon(0xdd,@_); }
    105 sub aesdec	{ aescommon(0xde,@_); }
    106 sub aesdeclast	{ aescommon(0xdf,@_); }
    107 
    109 # Inline version of internal aesni_[en|de]crypt1
    110 { my $sn;
    111 sub aesni_inline_generate1
    112 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
    113   $sn++;
    114 
    115     &$movekey		($rndkey0,&QWP(0,$key));
    116     &$movekey		($rndkey1,&QWP(16,$key));
    117     &xorps		($ivec,$rndkey0)	if (defined($ivec));
    118     &lea		($key,&DWP(32,$key));
    119     &xorps		($inout,$ivec)		if (defined($ivec));
    120     &xorps		($inout,$rndkey0)	if (!defined($ivec));
    121     &set_label("${p}1_loop_$sn");
    122 	eval"&aes${p}	($inout,$rndkey1)";
    123 	&dec		($rounds);
    124 	&$movekey	($rndkey1,&QWP(0,$key));
    125 	&lea		($key,&DWP(16,$key));
    126     &jnz		(&label("${p}1_loop_$sn"));
    127     eval"&aes${p}last	($inout,$rndkey1)";
    128 }}
    129 
    130 sub aesni_generate1	# fully unrolled loop
    131 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
    132 
    133     &function_begin_B("_aesni_${p}rypt1");
    134 	&movups		($rndkey0,&QWP(0,$key));
    135 	&$movekey	($rndkey1,&QWP(0x10,$key));
    136 	&xorps		($inout,$rndkey0);
    137 	&$movekey	($rndkey0,&QWP(0x20,$key));
    138 	&lea		($key,&DWP(0x30,$key));
    139 	&cmp		($rounds,11);
    140 	&jb		(&label("${p}128"));
    141 	&lea		($key,&DWP(0x20,$key));
    142 	&je		(&label("${p}192"));
    143 	&lea		($key,&DWP(0x20,$key));
    144 	eval"&aes${p}	($inout,$rndkey1)";
    145 	&$movekey	($rndkey1,&QWP(-0x40,$key));
    146 	eval"&aes${p}	($inout,$rndkey0)";
    147 	&$movekey	($rndkey0,&QWP(-0x30,$key));
    148     &set_label("${p}192");
    149 	eval"&aes${p}	($inout,$rndkey1)";
    150 	&$movekey	($rndkey1,&QWP(-0x20,$key));
    151 	eval"&aes${p}	($inout,$rndkey0)";
    152 	&$movekey	($rndkey0,&QWP(-0x10,$key));
    153     &set_label("${p}128");
    154 	eval"&aes${p}	($inout,$rndkey1)";
    155 	&$movekey	($rndkey1,&QWP(0,$key));
    156 	eval"&aes${p}	($inout,$rndkey0)";
    157 	&$movekey	($rndkey0,&QWP(0x10,$key));
    158 	eval"&aes${p}	($inout,$rndkey1)";
    159 	&$movekey	($rndkey1,&QWP(0x20,$key));
    160 	eval"&aes${p}	($inout,$rndkey0)";
    161 	&$movekey	($rndkey0,&QWP(0x30,$key));
    162 	eval"&aes${p}	($inout,$rndkey1)";
    163 	&$movekey	($rndkey1,&QWP(0x40,$key));
    164 	eval"&aes${p}	($inout,$rndkey0)";
    165 	&$movekey	($rndkey0,&QWP(0x50,$key));
    166 	eval"&aes${p}	($inout,$rndkey1)";
    167 	&$movekey	($rndkey1,&QWP(0x60,$key));
    168 	eval"&aes${p}	($inout,$rndkey0)";
    169 	&$movekey	($rndkey0,&QWP(0x70,$key));
    170 	eval"&aes${p}	($inout,$rndkey1)";
    171     eval"&aes${p}last	($inout,$rndkey0)";
    172     &ret();
    173     &function_end_B("_aesni_${p}rypt1");
    174 }
    175 
    177 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
    178 &aesni_generate1("enc") if (!$inline);
    179 &function_begin_B("${PREFIX}_encrypt");
    180 	&mov	("eax",&wparam(0));
    181 	&mov	($key,&wparam(2));
    182 	&movups	($inout0,&QWP(0,"eax"));
    183 	&mov	($rounds,&DWP(240,$key));
    184 	&mov	("eax",&wparam(1));
    185 	if ($inline)
    186 	{   &aesni_inline_generate1("enc");	}
    187 	else
    188 	{   &call	("_aesni_encrypt1");	}
    189 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    190 	&pxor	($rndkey1,$rndkey1);
    191 	&movups	(&QWP(0,"eax"),$inout0);
    192 	&pxor	($inout0,$inout0);
    193 	&ret	();
    194 &function_end_B("${PREFIX}_encrypt");
    195 
    196 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
    197 &aesni_generate1("dec") if(!$inline);
    198 &function_begin_B("${PREFIX}_decrypt");
    199 	&mov	("eax",&wparam(0));
    200 	&mov	($key,&wparam(2));
    201 	&movups	($inout0,&QWP(0,"eax"));
    202 	&mov	($rounds,&DWP(240,$key));
    203 	&mov	("eax",&wparam(1));
    204 	if ($inline)
    205 	{   &aesni_inline_generate1("dec");	}
    206 	else
    207 	{   &call	("_aesni_decrypt1");	}
    208 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    209 	&pxor	($rndkey1,$rndkey1);
    210 	&movups	(&QWP(0,"eax"),$inout0);
    211 	&pxor	($inout0,$inout0);
    212 	&ret	();
    213 &function_end_B("${PREFIX}_decrypt");
    214 
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on account of Atom Silvermont. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals the corresponding instruction's latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
    229 
    230 sub aesni_generate2
    231 { my $p=shift;
    232 
    233     &function_begin_B("_aesni_${p}rypt2");
    234 	&$movekey	($rndkey0,&QWP(0,$key));
    235 	&shl		($rounds,4);
    236 	&$movekey	($rndkey1,&QWP(16,$key));
    237 	&xorps		($inout0,$rndkey0);
    238 	&pxor		($inout1,$rndkey0);
    239 	&$movekey	($rndkey0,&QWP(32,$key));
    240 	&lea		($key,&DWP(32,$key,$rounds));
    241 	&neg		($rounds);
    242 	&add		($rounds,16);
    243 
    244     &set_label("${p}2_loop");
    245 	eval"&aes${p}	($inout0,$rndkey1)";
    246 	eval"&aes${p}	($inout1,$rndkey1)";
    247 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    248 	&add		($rounds,32);
    249 	eval"&aes${p}	($inout0,$rndkey0)";
    250 	eval"&aes${p}	($inout1,$rndkey0)";
    251 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    252 	&jnz		(&label("${p}2_loop"));
    253     eval"&aes${p}	($inout0,$rndkey1)";
    254     eval"&aes${p}	($inout1,$rndkey1)";
    255     eval"&aes${p}last	($inout0,$rndkey0)";
    256     eval"&aes${p}last	($inout1,$rndkey0)";
    257     &ret();
    258     &function_end_B("_aesni_${p}rypt2");
    259 }
    260 
    261 sub aesni_generate3
    262 { my $p=shift;
    263 
    264     &function_begin_B("_aesni_${p}rypt3");
    265 	&$movekey	($rndkey0,&QWP(0,$key));
    266 	&shl		($rounds,4);
    267 	&$movekey	($rndkey1,&QWP(16,$key));
    268 	&xorps		($inout0,$rndkey0);
    269 	&pxor		($inout1,$rndkey0);
    270 	&pxor		($inout2,$rndkey0);
    271 	&$movekey	($rndkey0,&QWP(32,$key));
    272 	&lea		($key,&DWP(32,$key,$rounds));
    273 	&neg		($rounds);
    274 	&add		($rounds,16);
    275 
    276     &set_label("${p}3_loop");
    277 	eval"&aes${p}	($inout0,$rndkey1)";
    278 	eval"&aes${p}	($inout1,$rndkey1)";
    279 	eval"&aes${p}	($inout2,$rndkey1)";
    280 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    281 	&add		($rounds,32);
    282 	eval"&aes${p}	($inout0,$rndkey0)";
    283 	eval"&aes${p}	($inout1,$rndkey0)";
    284 	eval"&aes${p}	($inout2,$rndkey0)";
    285 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    286 	&jnz		(&label("${p}3_loop"));
    287     eval"&aes${p}	($inout0,$rndkey1)";
    288     eval"&aes${p}	($inout1,$rndkey1)";
    289     eval"&aes${p}	($inout2,$rndkey1)";
    290     eval"&aes${p}last	($inout0,$rndkey0)";
    291     eval"&aes${p}last	($inout1,$rndkey0)";
    292     eval"&aes${p}last	($inout2,$rndkey0)";
    293     &ret();
    294     &function_end_B("_aesni_${p}rypt3");
    295 }
    296 
    297 # 4x interleave is implemented to improve small block performance,
    298 # most notably [and naturally] 4 block by ~30%. One can argue that one
    299 # should have implemented 5x as well, but improvement  would be <20%,
    300 # so it's not worth it...
    301 sub aesni_generate4
    302 { my $p=shift;
    303 
    304     &function_begin_B("_aesni_${p}rypt4");
    305 	&$movekey	($rndkey0,&QWP(0,$key));
    306 	&$movekey	($rndkey1,&QWP(16,$key));
    307 	&shl		($rounds,4);
    308 	&xorps		($inout0,$rndkey0);
    309 	&pxor		($inout1,$rndkey0);
    310 	&pxor		($inout2,$rndkey0);
    311 	&pxor		($inout3,$rndkey0);
    312 	&$movekey	($rndkey0,&QWP(32,$key));
    313 	&lea		($key,&DWP(32,$key,$rounds));
    314 	&neg		($rounds);
    315 	&data_byte	(0x0f,0x1f,0x40,0x00);
    316 	&add		($rounds,16);
    317 
    318     &set_label("${p}4_loop");
    319 	eval"&aes${p}	($inout0,$rndkey1)";
    320 	eval"&aes${p}	($inout1,$rndkey1)";
    321 	eval"&aes${p}	($inout2,$rndkey1)";
    322 	eval"&aes${p}	($inout3,$rndkey1)";
    323 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    324 	&add		($rounds,32);
    325 	eval"&aes${p}	($inout0,$rndkey0)";
    326 	eval"&aes${p}	($inout1,$rndkey0)";
    327 	eval"&aes${p}	($inout2,$rndkey0)";
    328 	eval"&aes${p}	($inout3,$rndkey0)";
    329 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    330     &jnz		(&label("${p}4_loop"));
    331 
    332     eval"&aes${p}	($inout0,$rndkey1)";
    333     eval"&aes${p}	($inout1,$rndkey1)";
    334     eval"&aes${p}	($inout2,$rndkey1)";
    335     eval"&aes${p}	($inout3,$rndkey1)";
    336     eval"&aes${p}last	($inout0,$rndkey0)";
    337     eval"&aes${p}last	($inout1,$rndkey0)";
    338     eval"&aes${p}last	($inout2,$rndkey0)";
    339     eval"&aes${p}last	($inout3,$rndkey0)";
    340     &ret();
    341     &function_end_B("_aesni_${p}rypt4");
    342 }
    343 
    344 sub aesni_generate6
    345 { my $p=shift;
    346 
    347     &function_begin_B("_aesni_${p}rypt6");
    348     &static_label("_aesni_${p}rypt6_enter");
    349 	&$movekey	($rndkey0,&QWP(0,$key));
    350 	&shl		($rounds,4);
    351 	&$movekey	($rndkey1,&QWP(16,$key));
    352 	&xorps		($inout0,$rndkey0);
    353 	&pxor		($inout1,$rndkey0);	# pxor does better here
    354 	&pxor		($inout2,$rndkey0);
    355 	eval"&aes${p}	($inout0,$rndkey1)";
    356 	&pxor		($inout3,$rndkey0);
    357 	&pxor		($inout4,$rndkey0);
    358 	eval"&aes${p}	($inout1,$rndkey1)";
    359 	&lea		($key,&DWP(32,$key,$rounds));
    360 	&neg		($rounds);
    361 	eval"&aes${p}	($inout2,$rndkey1)";
    362 	&pxor		($inout5,$rndkey0);
    363 	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
    364 	&add		($rounds,16);
    365 	&jmp		(&label("_aesni_${p}rypt6_inner"));
    366 
    367     &set_label("${p}6_loop",16);
    368 	eval"&aes${p}	($inout0,$rndkey1)";
    369 	eval"&aes${p}	($inout1,$rndkey1)";
    370 	eval"&aes${p}	($inout2,$rndkey1)";
    371     &set_label("_aesni_${p}rypt6_inner");
    372 	eval"&aes${p}	($inout3,$rndkey1)";
    373 	eval"&aes${p}	($inout4,$rndkey1)";
    374 	eval"&aes${p}	($inout5,$rndkey1)";
    375     &set_label("_aesni_${p}rypt6_enter");
    376 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    377 	&add		($rounds,32);
    378 	eval"&aes${p}	($inout0,$rndkey0)";
    379 	eval"&aes${p}	($inout1,$rndkey0)";
    380 	eval"&aes${p}	($inout2,$rndkey0)";
    381 	eval"&aes${p}	($inout3,$rndkey0)";
    382 	eval"&aes${p}	($inout4,$rndkey0)";
    383 	eval"&aes${p}	($inout5,$rndkey0)";
    384 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    385     &jnz		(&label("${p}6_loop"));
    386 
    387     eval"&aes${p}	($inout0,$rndkey1)";
    388     eval"&aes${p}	($inout1,$rndkey1)";
    389     eval"&aes${p}	($inout2,$rndkey1)";
    390     eval"&aes${p}	($inout3,$rndkey1)";
    391     eval"&aes${p}	($inout4,$rndkey1)";
    392     eval"&aes${p}	($inout5,$rndkey1)";
    393     eval"&aes${p}last	($inout0,$rndkey0)";
    394     eval"&aes${p}last	($inout1,$rndkey0)";
    395     eval"&aes${p}last	($inout2,$rndkey0)";
    396     eval"&aes${p}last	($inout3,$rndkey0)";
    397     eval"&aes${p}last	($inout4,$rndkey0)";
    398     eval"&aes${p}last	($inout5,$rndkey0)";
    399     &ret();
    400     &function_end_B("_aesni_${p}rypt6");
    401 }
    402 &aesni_generate2("enc") if ($PREFIX eq "aesni");
    403 &aesni_generate2("dec");
    404 &aesni_generate3("enc") if ($PREFIX eq "aesni");
    405 &aesni_generate3("dec");
    406 &aesni_generate4("enc") if ($PREFIX eq "aesni");
    407 &aesni_generate4("dec");
    408 &aesni_generate6("enc") if ($PREFIX eq "aesni");
    409 &aesni_generate6("dec");
    410 
    412 if ($PREFIX eq "aesni") {
    413 ######################################################################
    414 # void aesni_ecb_encrypt (const void *in, void *out,
    415 #                         size_t length, const AES_KEY *key,
    416 #                         int enc);
    417 &function_begin("aesni_ecb_encrypt");
    418 	&mov	($inp,&wparam(0));
    419 	&mov	($out,&wparam(1));
    420 	&mov	($len,&wparam(2));
    421 	&mov	($key,&wparam(3));
    422 	&mov	($rounds_,&wparam(4));
    423 	&and	($len,-16);
    424 	&jz	(&label("ecb_ret"));
    425 	&mov	($rounds,&DWP(240,$key));
    426 	&test	($rounds_,$rounds_);
    427 	&jz	(&label("ecb_decrypt"));
    428 
    429 	&mov	($key_,$key);		# backup $key
    430 	&mov	($rounds_,$rounds);	# backup $rounds
    431 	&cmp	($len,0x60);
    432 	&jb	(&label("ecb_enc_tail"));
    433 
    434 	&movdqu	($inout0,&QWP(0,$inp));
    435 	&movdqu	($inout1,&QWP(0x10,$inp));
    436 	&movdqu	($inout2,&QWP(0x20,$inp));
    437 	&movdqu	($inout3,&QWP(0x30,$inp));
    438 	&movdqu	($inout4,&QWP(0x40,$inp));
    439 	&movdqu	($inout5,&QWP(0x50,$inp));
    440 	&lea	($inp,&DWP(0x60,$inp));
    441 	&sub	($len,0x60);
    442 	&jmp	(&label("ecb_enc_loop6_enter"));
    443 
    444 &set_label("ecb_enc_loop6",16);
    445 	&movups	(&QWP(0,$out),$inout0);
    446 	&movdqu	($inout0,&QWP(0,$inp));
    447 	&movups	(&QWP(0x10,$out),$inout1);
    448 	&movdqu	($inout1,&QWP(0x10,$inp));
    449 	&movups	(&QWP(0x20,$out),$inout2);
    450 	&movdqu	($inout2,&QWP(0x20,$inp));
    451 	&movups	(&QWP(0x30,$out),$inout3);
    452 	&movdqu	($inout3,&QWP(0x30,$inp));
    453 	&movups	(&QWP(0x40,$out),$inout4);
    454 	&movdqu	($inout4,&QWP(0x40,$inp));
    455 	&movups	(&QWP(0x50,$out),$inout5);
    456 	&lea	($out,&DWP(0x60,$out));
    457 	&movdqu	($inout5,&QWP(0x50,$inp));
    458 	&lea	($inp,&DWP(0x60,$inp));
    459 &set_label("ecb_enc_loop6_enter");
    460 
    461 	&call	("_aesni_encrypt6");
    462 
    463 	&mov	($key,$key_);		# restore $key
    464 	&mov	($rounds,$rounds_);	# restore $rounds
    465 	&sub	($len,0x60);
    466 	&jnc	(&label("ecb_enc_loop6"));
    467 
    468 	&movups	(&QWP(0,$out),$inout0);
    469 	&movups	(&QWP(0x10,$out),$inout1);
    470 	&movups	(&QWP(0x20,$out),$inout2);
    471 	&movups	(&QWP(0x30,$out),$inout3);
    472 	&movups	(&QWP(0x40,$out),$inout4);
    473 	&movups	(&QWP(0x50,$out),$inout5);
    474 	&lea	($out,&DWP(0x60,$out));
    475 	&add	($len,0x60);
    476 	&jz	(&label("ecb_ret"));
    477 
    478 &set_label("ecb_enc_tail");
    479 	&movups	($inout0,&QWP(0,$inp));
    480 	&cmp	($len,0x20);
    481 	&jb	(&label("ecb_enc_one"));
    482 	&movups	($inout1,&QWP(0x10,$inp));
    483 	&je	(&label("ecb_enc_two"));
    484 	&movups	($inout2,&QWP(0x20,$inp));
    485 	&cmp	($len,0x40);
    486 	&jb	(&label("ecb_enc_three"));
    487 	&movups	($inout3,&QWP(0x30,$inp));
    488 	&je	(&label("ecb_enc_four"));
    489 	&movups	($inout4,&QWP(0x40,$inp));
    490 	&xorps	($inout5,$inout5);
    491 	&call	("_aesni_encrypt6");
    492 	&movups	(&QWP(0,$out),$inout0);
    493 	&movups	(&QWP(0x10,$out),$inout1);
    494 	&movups	(&QWP(0x20,$out),$inout2);
    495 	&movups	(&QWP(0x30,$out),$inout3);
    496 	&movups	(&QWP(0x40,$out),$inout4);
    497 	jmp	(&label("ecb_ret"));
    498 
    499 &set_label("ecb_enc_one",16);
    500 	if ($inline)
    501 	{   &aesni_inline_generate1("enc");	}
    502 	else
    503 	{   &call	("_aesni_encrypt1");	}
    504 	&movups	(&QWP(0,$out),$inout0);
    505 	&jmp	(&label("ecb_ret"));
    506 
    507 &set_label("ecb_enc_two",16);
    508 	&call	("_aesni_encrypt2");
    509 	&movups	(&QWP(0,$out),$inout0);
    510 	&movups	(&QWP(0x10,$out),$inout1);
    511 	&jmp	(&label("ecb_ret"));
    512 
    513 &set_label("ecb_enc_three",16);
    514 	&call	("_aesni_encrypt3");
    515 	&movups	(&QWP(0,$out),$inout0);
    516 	&movups	(&QWP(0x10,$out),$inout1);
    517 	&movups	(&QWP(0x20,$out),$inout2);
    518 	&jmp	(&label("ecb_ret"));
    519 
    520 &set_label("ecb_enc_four",16);
    521 	&call	("_aesni_encrypt4");
    522 	&movups	(&QWP(0,$out),$inout0);
    523 	&movups	(&QWP(0x10,$out),$inout1);
    524 	&movups	(&QWP(0x20,$out),$inout2);
    525 	&movups	(&QWP(0x30,$out),$inout3);
    526 	&jmp	(&label("ecb_ret"));
    527 ######################################################################
    528 &set_label("ecb_decrypt",16);
    529 	&mov	($key_,$key);		# backup $key
    530 	&mov	($rounds_,$rounds);	# backup $rounds
    531 	&cmp	($len,0x60);
    532 	&jb	(&label("ecb_dec_tail"));
    533 
    534 	&movdqu	($inout0,&QWP(0,$inp));
    535 	&movdqu	($inout1,&QWP(0x10,$inp));
    536 	&movdqu	($inout2,&QWP(0x20,$inp));
    537 	&movdqu	($inout3,&QWP(0x30,$inp));
    538 	&movdqu	($inout4,&QWP(0x40,$inp));
    539 	&movdqu	($inout5,&QWP(0x50,$inp));
    540 	&lea	($inp,&DWP(0x60,$inp));
    541 	&sub	($len,0x60);
    542 	&jmp	(&label("ecb_dec_loop6_enter"));
    543 
    544 &set_label("ecb_dec_loop6",16);
    545 	&movups	(&QWP(0,$out),$inout0);
    546 	&movdqu	($inout0,&QWP(0,$inp));
    547 	&movups	(&QWP(0x10,$out),$inout1);
    548 	&movdqu	($inout1,&QWP(0x10,$inp));
    549 	&movups	(&QWP(0x20,$out),$inout2);
    550 	&movdqu	($inout2,&QWP(0x20,$inp));
    551 	&movups	(&QWP(0x30,$out),$inout3);
    552 	&movdqu	($inout3,&QWP(0x30,$inp));
    553 	&movups	(&QWP(0x40,$out),$inout4);
    554 	&movdqu	($inout4,&QWP(0x40,$inp));
    555 	&movups	(&QWP(0x50,$out),$inout5);
    556 	&lea	($out,&DWP(0x60,$out));
    557 	&movdqu	($inout5,&QWP(0x50,$inp));
    558 	&lea	($inp,&DWP(0x60,$inp));
    559 &set_label("ecb_dec_loop6_enter");
    560 
    561 	&call	("_aesni_decrypt6");
    562 
    563 	&mov	($key,$key_);		# restore $key
    564 	&mov	($rounds,$rounds_);	# restore $rounds
    565 	&sub	($len,0x60);
    566 	&jnc	(&label("ecb_dec_loop6"));
    567 
    568 	&movups	(&QWP(0,$out),$inout0);
    569 	&movups	(&QWP(0x10,$out),$inout1);
    570 	&movups	(&QWP(0x20,$out),$inout2);
    571 	&movups	(&QWP(0x30,$out),$inout3);
    572 	&movups	(&QWP(0x40,$out),$inout4);
    573 	&movups	(&QWP(0x50,$out),$inout5);
    574 	&lea	($out,&DWP(0x60,$out));
    575 	&add	($len,0x60);
    576 	&jz	(&label("ecb_ret"));
    577 
    578 &set_label("ecb_dec_tail");
    579 	&movups	($inout0,&QWP(0,$inp));
    580 	&cmp	($len,0x20);
    581 	&jb	(&label("ecb_dec_one"));
    582 	&movups	($inout1,&QWP(0x10,$inp));
    583 	&je	(&label("ecb_dec_two"));
    584 	&movups	($inout2,&QWP(0x20,$inp));
    585 	&cmp	($len,0x40);
    586 	&jb	(&label("ecb_dec_three"));
    587 	&movups	($inout3,&QWP(0x30,$inp));
    588 	&je	(&label("ecb_dec_four"));
    589 	&movups	($inout4,&QWP(0x40,$inp));
    590 	&xorps	($inout5,$inout5);
    591 	&call	("_aesni_decrypt6");
    592 	&movups	(&QWP(0,$out),$inout0);
    593 	&movups	(&QWP(0x10,$out),$inout1);
    594 	&movups	(&QWP(0x20,$out),$inout2);
    595 	&movups	(&QWP(0x30,$out),$inout3);
    596 	&movups	(&QWP(0x40,$out),$inout4);
    597 	&jmp	(&label("ecb_ret"));
    598 
    599 &set_label("ecb_dec_one",16);
    600 	if ($inline)
    601 	{   &aesni_inline_generate1("dec");	}
    602 	else
    603 	{   &call	("_aesni_decrypt1");	}
    604 	&movups	(&QWP(0,$out),$inout0);
    605 	&jmp	(&label("ecb_ret"));
    606 
    607 &set_label("ecb_dec_two",16);
    608 	&call	("_aesni_decrypt2");
    609 	&movups	(&QWP(0,$out),$inout0);
    610 	&movups	(&QWP(0x10,$out),$inout1);
    611 	&jmp	(&label("ecb_ret"));
    612 
    613 &set_label("ecb_dec_three",16);
    614 	&call	("_aesni_decrypt3");
    615 	&movups	(&QWP(0,$out),$inout0);
    616 	&movups	(&QWP(0x10,$out),$inout1);
    617 	&movups	(&QWP(0x20,$out),$inout2);
    618 	&jmp	(&label("ecb_ret"));
    619 
    620 &set_label("ecb_dec_four",16);
    621 	&call	("_aesni_decrypt4");
    622 	&movups	(&QWP(0,$out),$inout0);
    623 	&movups	(&QWP(0x10,$out),$inout1);
    624 	&movups	(&QWP(0x20,$out),$inout2);
    625 	&movups	(&QWP(0x30,$out),$inout3);
    626 
    627 &set_label("ecb_ret");
    628 	&pxor	("xmm0","xmm0");		# clear register bank
    629 	&pxor	("xmm1","xmm1");
    630 	&pxor	("xmm2","xmm2");
    631 	&pxor	("xmm3","xmm3");
    632 	&pxor	("xmm4","xmm4");
    633 	&pxor	("xmm5","xmm5");
    634 	&pxor	("xmm6","xmm6");
    635 	&pxor	("xmm7","xmm7");
    636 &function_end("aesni_ecb_encrypt");
    637 
    639 ######################################################################
    640 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
    641 #                         size_t blocks, const AES_KEY *key,
    642 #                         const char *ivec,char *cmac);
    643 #
    644 # Handles only complete blocks, operates on 64-bit counter and
    645 # does not update *ivec! Nor does it finalize CMAC value
    646 # (see engine/eng_aesni.c for details)
    647 #
    648 { my $cmac=$inout1;
    649 &function_begin("aesni_ccm64_encrypt_blocks");
    650 	&mov	($inp,&wparam(0));
    651 	&mov	($out,&wparam(1));
    652 	&mov	($len,&wparam(2));
    653 	&mov	($key,&wparam(3));
    654 	&mov	($rounds_,&wparam(4));
    655 	&mov	($rounds,&wparam(5));
    656 	&mov	($key_,"esp");
    657 	&sub	("esp",60);
    658 	&and	("esp",-16);			# align stack
    659 	&mov	(&DWP(48,"esp"),$key_);
    660 
    661 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    662 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    663 	&mov	($rounds,&DWP(240,$key));
    664 
    665 	# compose byte-swap control mask for pshufb on stack
    666 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    667 	&mov	(&DWP(4,"esp"),0x08090a0b);
    668 	&mov	(&DWP(8,"esp"),0x04050607);
    669 	&mov	(&DWP(12,"esp"),0x00010203);
    670 
    671 	# compose counter increment vector on stack
    672 	&mov	($rounds_,1);
    673 	&xor	($key_,$key_);
    674 	&mov	(&DWP(16,"esp"),$rounds_);
    675 	&mov	(&DWP(20,"esp"),$key_);
    676 	&mov	(&DWP(24,"esp"),$key_);
    677 	&mov	(&DWP(28,"esp"),$key_);
    678 
    679 	&shl	($rounds,4);
    680 	&mov	($rounds_,16);
    681 	&lea	($key_,&DWP(0,$key));
    682 	&movdqa	($inout3,&QWP(0,"esp"));
    683 	&movdqa	($inout0,$ivec);
    684 	&lea	($key,&DWP(32,$key,$rounds));
    685 	&sub	($rounds_,$rounds);
    686 	&pshufb	($ivec,$inout3);
    687 
    688 &set_label("ccm64_enc_outer");
    689 	&$movekey	($rndkey0,&QWP(0,$key_));
    690 	&mov		($rounds,$rounds_);
    691 	&movups		($in0,&QWP(0,$inp));
    692 
    693 	&xorps		($inout0,$rndkey0);
    694 	&$movekey	($rndkey1,&QWP(16,$key_));
    695 	&xorps		($rndkey0,$in0);
    696 	&xorps		($cmac,$rndkey0);		# cmac^=inp
    697 	&$movekey	($rndkey0,&QWP(32,$key_));
    698 
    699 &set_label("ccm64_enc2_loop");
    700 	&aesenc		($inout0,$rndkey1);
    701 	&aesenc		($cmac,$rndkey1);
    702 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    703 	&add		($rounds,32);
    704 	&aesenc		($inout0,$rndkey0);
    705 	&aesenc		($cmac,$rndkey0);
    706 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    707 	&jnz		(&label("ccm64_enc2_loop"));
    708 	&aesenc		($inout0,$rndkey1);
    709 	&aesenc		($cmac,$rndkey1);
    710 	&paddq		($ivec,&QWP(16,"esp"));
    711 	&dec		($len);
    712 	&aesenclast	($inout0,$rndkey0);
    713 	&aesenclast	($cmac,$rndkey0);
    714 
    715 	&lea	($inp,&DWP(16,$inp));
    716 	&xorps	($in0,$inout0);			# inp^=E(ivec)
    717 	&movdqa	($inout0,$ivec);
    718 	&movups	(&QWP(0,$out),$in0);		# save output
    719 	&pshufb	($inout0,$inout3);
    720 	&lea	($out,&DWP(16,$out));
    721 	&jnz	(&label("ccm64_enc_outer"));
    722 
    723 	&mov	("esp",&DWP(48,"esp"));
    724 	&mov	($out,&wparam(5));
    725 	&movups	(&QWP(0,$out),$cmac);
    726 
    727 	&pxor	("xmm0","xmm0");		# clear register bank
    728 	&pxor	("xmm1","xmm1");
    729 	&pxor	("xmm2","xmm2");
    730 	&pxor	("xmm3","xmm3");
    731 	&pxor	("xmm4","xmm4");
    732 	&pxor	("xmm5","xmm5");
    733 	&pxor	("xmm6","xmm6");
    734 	&pxor	("xmm7","xmm7");
    735 &function_end("aesni_ccm64_encrypt_blocks");
    736 
    737 &function_begin("aesni_ccm64_decrypt_blocks");
    738 	&mov	($inp,&wparam(0));
    739 	&mov	($out,&wparam(1));
    740 	&mov	($len,&wparam(2));
    741 	&mov	($key,&wparam(3));
    742 	&mov	($rounds_,&wparam(4));
    743 	&mov	($rounds,&wparam(5));
    744 	&mov	($key_,"esp");
    745 	&sub	("esp",60);
    746 	&and	("esp",-16);			# align stack
    747 	&mov	(&DWP(48,"esp"),$key_);
    748 
    749 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    750 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    751 	&mov	($rounds,&DWP(240,$key));
    752 
    753 	# compose byte-swap control mask for pshufb on stack
    754 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    755 	&mov	(&DWP(4,"esp"),0x08090a0b);
    756 	&mov	(&DWP(8,"esp"),0x04050607);
    757 	&mov	(&DWP(12,"esp"),0x00010203);
    758 
    759 	# compose counter increment vector on stack
    760 	&mov	($rounds_,1);
    761 	&xor	($key_,$key_);
    762 	&mov	(&DWP(16,"esp"),$rounds_);
    763 	&mov	(&DWP(20,"esp"),$key_);
    764 	&mov	(&DWP(24,"esp"),$key_);
    765 	&mov	(&DWP(28,"esp"),$key_);
    766 
    767 	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
    768 	&movdqa	($inout0,$ivec);
    769 
    770 	&mov	($key_,$key);
    771 	&mov	($rounds_,$rounds);
    772 
    773 	&pshufb	($ivec,$inout3);
    774 	if ($inline)
    775 	{   &aesni_inline_generate1("enc");	}
    776 	else
    777 	{   &call	("_aesni_encrypt1");	}
    778 	&shl	($rounds_,4);
    779 	&mov	($rounds,16);
    780 	&movups	($in0,&QWP(0,$inp));		# load inp
    781 	&paddq	($ivec,&QWP(16,"esp"));
    782 	&lea	($inp,&QWP(16,$inp));
    783 	&sub	($rounds,$rounds_);
    784 	&lea	($key,&DWP(32,$key_,$rounds_));
    785 	&mov	($rounds_,$rounds);
    786 	&jmp	(&label("ccm64_dec_outer"));
    787 
    788 &set_label("ccm64_dec_outer",16);
    789 	&xorps	($in0,$inout0);			# inp ^= E(ivec)
    790 	&movdqa	($inout0,$ivec);
    791 	&movups	(&QWP(0,$out),$in0);		# save output
    792 	&lea	($out,&DWP(16,$out));
    793 	&pshufb	($inout0,$inout3);
    794 
    795 	&sub	($len,1);
    796 	&jz	(&label("ccm64_dec_break"));
    797 
    798 	&$movekey	($rndkey0,&QWP(0,$key_));
    799 	&mov		($rounds,$rounds_);
    800 	&$movekey	($rndkey1,&QWP(16,$key_));
    801 	&xorps		($in0,$rndkey0);
    802 	&xorps		($inout0,$rndkey0);
    803 	&xorps		($cmac,$in0);		# cmac^=out
    804 	&$movekey	($rndkey0,&QWP(32,$key_));
    805 
    806 &set_label("ccm64_dec2_loop");
    807 	&aesenc		($inout0,$rndkey1);
    808 	&aesenc		($cmac,$rndkey1);
    809 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    810 	&add		($rounds,32);
    811 	&aesenc		($inout0,$rndkey0);
    812 	&aesenc		($cmac,$rndkey0);
    813 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    814 	&jnz		(&label("ccm64_dec2_loop"));
    815 	&movups		($in0,&QWP(0,$inp));	# load inp
    816 	&paddq		($ivec,&QWP(16,"esp"));
    817 	&aesenc		($inout0,$rndkey1);
    818 	&aesenc		($cmac,$rndkey1);
    819 	&aesenclast	($inout0,$rndkey0);
    820 	&aesenclast	($cmac,$rndkey0);
    821 	&lea		($inp,&QWP(16,$inp));
    822 	&jmp	(&label("ccm64_dec_outer"));
    823 
    824 &set_label("ccm64_dec_break",16);
    825 	&mov	($rounds,&DWP(240,$key_));
    826 	&mov	($key,$key_);
    827 	if ($inline)
    828 	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
    829 	else
    830 	{   &call	("_aesni_encrypt1",$cmac);	}
    831 
    832 	&mov	("esp",&DWP(48,"esp"));
    833 	&mov	($out,&wparam(5));
    834 	&movups	(&QWP(0,$out),$cmac);
    835 
    836 	&pxor	("xmm0","xmm0");		# clear register bank
    837 	&pxor	("xmm1","xmm1");
    838 	&pxor	("xmm2","xmm2");
    839 	&pxor	("xmm3","xmm3");
    840 	&pxor	("xmm4","xmm4");
    841 	&pxor	("xmm5","xmm5");
    842 	&pxor	("xmm6","xmm6");
    843 	&pxor	("xmm7","xmm7");
    844 &function_end("aesni_ccm64_decrypt_blocks");
    845 }
    846 
    848 ######################################################################
    849 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
    850 #                         size_t blocks, const AES_KEY *key,
    851 #                         const char *ivec);
    852 #
    853 # Handles only complete blocks, operates on 32-bit counter and
    854 # does not update *ivec! (see crypto/modes/ctr128.c for details)
    855 #
    856 # stack layout:
    857 #	0	pshufb mask
    858 #	16	vector addend: 0,6,6,6
    859 # 	32	counter-less ivec (xor-ed with key[0] while the 6x loop runs)
    860 #	48	1st triplet of counter vector
    861 #	64	2nd triplet of counter vector
    862 #	80	saved %esp
    863 
    864 &function_begin("aesni_ctr32_encrypt_blocks");
        	# CTR-mode processing of whole 16-byte blocks.  The 32-bit counter
        	# is taken from the last dword of *ivec (big-endian, hence the
        	# bswap below); the remaining 96 bits are kept as a "counter-less
        	# ivec".  Counter values are maintained as two triplets (n,n+1,n+2
        	# and n+3,n+4,n+5) so six blocks can be encrypted per iteration;
        	# 1..5 leftover blocks go through the ctr32_tail dispatch.  *ivec
        	# is only read, never written back.
    865 	&mov	($inp,&wparam(0));
    866 	&mov	($out,&wparam(1));
    867 	&mov	($len,&wparam(2));
    868 	&mov	($key,&wparam(3));
    869 	&mov	($rounds_,&wparam(4));
        	# allocate a 16-byte aligned frame; the caller's %esp is parked in
        	# $key_ just long enough to be stored at 80(%esp)
    870 	&mov	($key_,"esp");
    871 	&sub	("esp",88);
    872 	&and	("esp",-16);			# align stack
    873 	&mov	(&DWP(80,"esp"),$key_);
    874 
        	# a single block needs none of the counter vectors built below
    875 	&cmp	($len,1);
    876 	&je	(&label("ctr32_one_shortcut"));
    877 
    878 	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
    879 
    880 	# compose byte-swap control mask for pshufb on stack
        	# (byte i of the result is byte 15-i of the source)
    881 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    882 	&mov	(&DWP(4,"esp"),0x08090a0b);
    883 	&mov	(&DWP(8,"esp"),0x04050607);
    884 	&mov	(&DWP(12,"esp"),0x00010203);
    885 
    886 	# compose counter increment vector on stack
        	# (dwords 0..2 = 6, dword 3 = 0; $key_ doubles as the zero source)
    887 	&mov	($rounds,6);
    888 	&xor	($key_,$key_);
    889 	&mov	(&DWP(16,"esp"),$rounds);
    890 	&mov	(&DWP(20,"esp"),$rounds);
    891 	&mov	(&DWP(24,"esp"),$rounds);
    892 	&mov	(&DWP(28,"esp"),$key_);
    893 
    894 	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
    895 	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
    896 
    897 	&mov	($rounds,&DWP(240,$key));	# key->rounds
    898 
    899 	# compose 2 vectors of 3x32-bit counters
        	# ($rounds_ walks n,n+1,n+2 into $rndkey0 while $key_ walks
        	# n+3,n+4,n+5 into $rndkey1; dword 3 of both stays zero)
    900 	&bswap	($rounds_);
    901 	&pxor	($rndkey0,$rndkey0);
    902 	&pxor	($rndkey1,$rndkey1);
    903 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
    904 	&pinsrd	($rndkey0,$rounds_,0);
    905 	&lea	($key_,&DWP(3,$rounds_));
    906 	&pinsrd	($rndkey1,$key_,0);
    907 	&inc	($rounds_);
    908 	&pinsrd	($rndkey0,$rounds_,1);
    909 	&inc	($key_);
    910 	&pinsrd	($rndkey1,$key_,1);
    911 	&inc	($rounds_);
    912 	&pinsrd	($rndkey0,$rounds_,2);
    913 	&inc	($key_);
    914 	&pinsrd	($rndkey1,$key_,2);
    915 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
    916 	&pshufb	($rndkey0,$inout0);		# byte swap
    917 	&movdqu	($inout4,&QWP(0,$key));		# key[0]
    918 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
    919 	&pshufb	($rndkey1,$inout0);		# byte swap
    920 
        	# after the swap the three counters sit in dwords 3,2,1 and dword 0
        	# is zero; pshufd with N<<6 copies dword N to the upper dword and
        	# the zero dword everywhere else
    921 	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
    922 	&pshufd	($inout1,$rndkey0,2<<6);
    923 	&cmp	($len,6);
    924 	&jb	(&label("ctr32_tail"));
        	# six or more blocks: set up the 6x loop.  $rounds_ becomes
        	# 16-16*rounds and $key points 32+16*rounds bytes past the key
        	# schedule, while $key_ keeps the original key pointer.
        	# NOTE(review): presumably _aesni_encrypt6_enter (not visible here)
        	# uses $rounds as a negative index counting up to zero -- confirm.
    925 	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
    926 	&shl	($rounds,4);
    927 	&mov	($rounds_,16);
    928 	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
    929 	&mov	($key_,$key);			# backup $key
    930 	&sub	($rounds_,$rounds);		# backup twisted $rounds
    931 	&lea	($key,&DWP(32,$key,$rounds));
    932 	&sub	($len,6);
    933 	&jmp	(&label("ctr32_loop6"));
    934 
    935 &set_label("ctr32_loop6",16);
    936 	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
        	# (spread the remaining four counters, fold in ivec^key[0] and run
        	# the first round with key[1] before entering the shared body)
    937 	&pshufd	($inout2,$rndkey0,1<<6);
    938 	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
    939 	&pshufd	($inout3,$rndkey1,3<<6);
    940 	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
    941 	&pshufd	($inout4,$rndkey1,2<<6);
    942 	&pxor		($inout1,$rndkey0);
    943 	&pshufd	($inout5,$rndkey1,1<<6);
    944 	&$movekey	($rndkey1,&QWP(16,$key_));
    945 	&pxor		($inout2,$rndkey0);
    946 	&pxor		($inout3,$rndkey0);
    947 	&aesenc		($inout0,$rndkey1);
    948 	&pxor		($inout4,$rndkey0);
    949 	&pxor		($inout5,$rndkey0);
    950 	&aesenc		($inout1,$rndkey1);
    951 	&$movekey	($rndkey0,&QWP(32,$key_));
    952 	&mov		($rounds,$rounds_);
    953 	&aesenc		($inout2,$rndkey1);
    954 	&aesenc		($inout3,$rndkey1);
    955 	&aesenc		($inout4,$rndkey1);
    956 	&aesenc		($inout5,$rndkey1);
    957 
    958 	&call		(&label("_aesni_encrypt6_enter"));
    959 
        	# xor the six key-stream blocks with the input and store them; the
        	# counter update for the next iteration is interleaved with the
        	# loads and stores
    960 	&movups	($rndkey1,&QWP(0,$inp));
    961 	&movups	($rndkey0,&QWP(0x10,$inp));
    962 	&xorps	($inout0,$rndkey1);
    963 	&movups	($rndkey1,&QWP(0x20,$inp));
    964 	&xorps	($inout1,$rndkey0);
    965 	&movups	(&QWP(0,$out),$inout0);
    966 	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
    967 	&xorps	($inout2,$rndkey1);
    968 	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
    969 	&movups	(&QWP(0x10,$out),$inout1);
    970 	&movups	(&QWP(0x20,$out),$inout2);
    971 
    972 	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
    973 	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
    974 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
    975 
    976 	&movups	($inout1,&QWP(0x30,$inp));
    977 	&movups	($inout2,&QWP(0x40,$inp));
    978 	&xorps	($inout3,$inout1);
    979 	&movups	($inout1,&QWP(0x50,$inp));
    980 	&lea	($inp,&DWP(0x60,$inp));
    981 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
    982 	&pshufb	($rndkey0,$inout0);		# byte swap
    983 	&xorps	($inout4,$inout2);
    984 	&movups	(&QWP(0x30,$out),$inout3);
    985 	&xorps	($inout5,$inout1);
    986 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
    987 	&pshufb	($rndkey1,$inout0);		# byte swap
    988 	&movups	(&QWP(0x40,$out),$inout4);
    989 	&pshufd	($inout0,$rndkey0,3<<6);
    990 	&movups	(&QWP(0x50,$out),$inout5);
    991 	&lea	($out,&DWP(0x60,$out));
    992 
    993 	&pshufd	($inout1,$rndkey0,2<<6);
    994 	&sub	($len,6);
    995 	&jnc	(&label("ctr32_loop6"));
    996 
        	# $len was pre-decremented by 6; add it back to get the 0..5
        	# leftover blocks, then undo the loop setup: xor key[0] out of the
        	# saved ivec and reload the plain $key/$rounds
    997 	&add	($len,6);
    998 	&jz	(&label("ctr32_ret"));
    999 	&movdqu	($inout5,&QWP(0,$key_));
   1000 	&mov	($key,$key_);
   1001 	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
   1002 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1003 
   1004 &set_label("ctr32_tail");
        	# 1..5 blocks: merge each counter with the counter-less ivec only
        	# as far as needed and dispatch on $len.  The por/pshufd
        	# instructions sit between cmp and the second branch of each pair
        	# and leave the flags alone, so one cmp serves both jb and je.
   1005 	&por	($inout0,$inout5);
   1006 	&cmp	($len,2);
   1007 	&jb	(&label("ctr32_one"));
   1008 
   1009 	&pshufd	($inout2,$rndkey0,1<<6);
   1010 	&por	($inout1,$inout5);
   1011 	&je	(&label("ctr32_two"));
   1012 
   1013 	&pshufd	($inout3,$rndkey1,3<<6);
   1014 	&por	($inout2,$inout5);
   1015 	&cmp	($len,4);
   1016 	&jb	(&label("ctr32_three"));
   1017 
   1018 	&pshufd	($inout4,$rndkey1,2<<6);
   1019 	&por	($inout3,$inout5);
   1020 	&je	(&label("ctr32_four"));
   1021 
        	# five blocks: go through the 6x routine; only $inout0..$inout4
        	# are stored afterwards
   1022 	&por	($inout4,$inout5);
   1023 	&call	("_aesni_encrypt6");
   1024 	&movups	($rndkey1,&QWP(0,$inp));
   1025 	&movups	($rndkey0,&QWP(0x10,$inp));
   1026 	&xorps	($inout0,$rndkey1);
   1027 	&movups	($rndkey1,&QWP(0x20,$inp));
   1028 	&xorps	($inout1,$rndkey0);
   1029 	&movups	($rndkey0,&QWP(0x30,$inp));
   1030 	&xorps	($inout2,$rndkey1);
   1031 	&movups	($rndkey1,&QWP(0x40,$inp));
   1032 	&xorps	($inout3,$rndkey0);
   1033 	&movups	(&QWP(0,$out),$inout0);
   1034 	&xorps	($inout4,$rndkey1);
   1035 	&movups	(&QWP(0x10,$out),$inout1);
   1036 	&movups	(&QWP(0x20,$out),$inout2);
   1037 	&movups	(&QWP(0x30,$out),$inout3);
   1038 	&movups	(&QWP(0x40,$out),$inout4);
   1039 	&jmp	(&label("ctr32_ret"));
   1040 
   1041 &set_label("ctr32_one_shortcut",16);
        	# len==1 on entry: the ivec is used as the counter block as-is
   1042 	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
   1043 	&mov	($rounds,&DWP(240,$key));
   1044 	
   1045 &set_label("ctr32_one");
   1046 	if ($inline)
   1047 	{   &aesni_inline_generate1("enc");	}
   1048 	else
   1049 	{   &call	("_aesni_encrypt1");	}
   1050 	&movups	($in0,&QWP(0,$inp));
   1051 	&xorps	($in0,$inout0);
   1052 	&movups	(&QWP(0,$out),$in0);
   1053 	&jmp	(&label("ctr32_ret"));
   1054 
   1055 &set_label("ctr32_two",16);
   1056 	&call	("_aesni_encrypt2");
   1057 	&movups	($inout3,&QWP(0,$inp));
   1058 	&movups	($inout4,&QWP(0x10,$inp));
   1059 	&xorps	($inout0,$inout3);
   1060 	&xorps	($inout1,$inout4);
   1061 	&movups	(&QWP(0,$out),$inout0);
   1062 	&movups	(&QWP(0x10,$out),$inout1);
   1063 	&jmp	(&label("ctr32_ret"));
   1064 
   1065 &set_label("ctr32_three",16);
   1066 	&call	("_aesni_encrypt3");
   1067 	&movups	($inout3,&QWP(0,$inp));
   1068 	&movups	($inout4,&QWP(0x10,$inp));
   1069 	&xorps	($inout0,$inout3);
   1070 	&movups	($inout5,&QWP(0x20,$inp));
   1071 	&xorps	($inout1,$inout4);
   1072 	&movups	(&QWP(0,$out),$inout0);
   1073 	&xorps	($inout2,$inout5);
   1074 	&movups	(&QWP(0x10,$out),$inout1);
   1075 	&movups	(&QWP(0x20,$out),$inout2);
   1076 	&jmp	(&label("ctr32_ret"));
   1077 
   1078 &set_label("ctr32_four",16);
   1079 	&call	("_aesni_encrypt4");
   1080 	&movups	($inout4,&QWP(0,$inp));
   1081 	&movups	($inout5,&QWP(0x10,$inp));
   1082 	&movups	($rndkey1,&QWP(0x20,$inp));
   1083 	&xorps	($inout0,$inout4);
   1084 	&movups	($rndkey0,&QWP(0x30,$inp));
   1085 	&xorps	($inout1,$inout5);
   1086 	&movups	(&QWP(0,$out),$inout0);
   1087 	&xorps	($inout2,$rndkey1);
   1088 	&movups	(&QWP(0x10,$out),$inout1);
   1089 	&xorps	($inout3,$rndkey0);
   1090 	&movups	(&QWP(0x20,$out),$inout2);
   1091 	&movups	(&QWP(0x30,$out),$inout3);
   1092 
   1093 &set_label("ctr32_ret");
        	# common exit: zero all xmm registers and the frame slots that held
        	# ivec/counter material (32, 48, 64), then restore the caller's %esp
   1094 	&pxor	("xmm0","xmm0");		# clear register bank
   1095 	&pxor	("xmm1","xmm1");
   1096 	&pxor	("xmm2","xmm2");
   1097 	&pxor	("xmm3","xmm3");
   1098 	&pxor	("xmm4","xmm4");
   1099 	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
   1100 	&pxor	("xmm5","xmm5");
   1101 	&movdqa	(&QWP(48,"esp"),"xmm0");
   1102 	&pxor	("xmm6","xmm6");
   1103 	&movdqa	(&QWP(64,"esp"),"xmm0");
   1104 	&pxor	("xmm7","xmm7");
   1105 	&mov	("esp",&DWP(80,"esp"));
   1106 &function_end("aesni_ctr32_encrypt_blocks");
   1107 
   1109 ######################################################################
   1110 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
   1111 #	const AES_KEY *key1, const AES_KEY *key2,
   1112 #	const unsigned char iv[16]);
   1113 #
   1114 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
   1115 
   1116 &function_begin("aesni_xts_encrypt");
        	# XTS encryption of len bytes.  The initial tweak is iv encrypted
        	# under key2; data is then processed under key1, six blocks per
        	# iteration where possible, each block being xor-ed with its tweak
        	# before and after encryption.  A trailing partial block
        	# (len%16 != 0) is handled by ciphertext stealing in xts_enc_steal.
        	#
        	# frame layout used below:
        	#	16*0..16*5	per-block tweaks
        	#	16*6		constant, dwords 0x87,0,1,0
        	#	16*7+0		original len, later len%16
        	#	16*7+4		saved %esp
   1117 	&mov	($key,&wparam(4));		# key2
   1118 	&mov	($inp,&wparam(5));		# clear-text tweak
   1119 
   1120 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
   1121 	&movups	($inout0,&QWP(0,$inp));
   1122 	if ($inline)
   1123 	{   &aesni_inline_generate1("enc");	}
   1124 	else
   1125 	{   &call	("_aesni_encrypt1");	}
   1126 
   1127 	&mov	($inp,&wparam(0));
   1128 	&mov	($out,&wparam(1));
   1129 	&mov	($len,&wparam(2));
   1130 	&mov	($key,&wparam(3));		# key1
   1131 
   1132 	&mov	($key_,"esp");
   1133 	&sub	("esp",16*7+8);
   1134 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
   1135 	&and	("esp",-16);			# align stack
   1136 
   1137 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
   1138 	&mov	(&DWP(16*6+4,"esp"),0);
   1139 	&mov	(&DWP(16*6+8,"esp"),1);
   1140 	&mov	(&DWP(16*6+12,"esp"),0);
   1141 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
   1142 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
   1143 
        	# $inout0 still holds the encrypted tweak.  pcmpgtd against zero
        	# turns every dword of $twtmp into all-ones where the matching
        	# tweak dword has its top bit set.
   1144 	&movdqa	($tweak,$inout0);
   1145 	&pxor	($twtmp,$twtmp);
   1146 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
   1147 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1148 
   1149 	&and	($len,-16);
   1150 	&mov	($key_,$key);			# backup $key
   1151 	&mov	($rounds_,$rounds);		# backup $rounds
   1152 	&sub	($len,16*6);
   1153 	&jc	(&label("xts_enc_short"));
   1154 
        	# 6x loop setup: $rounds_ = 16-16*rounds, $key = key+32+16*rounds
        	# NOTE(review): presumably the form _aesni_encrypt6_enter expects
        	# (that routine is not visible here) -- confirm.
   1155 	&shl	($rounds,4);
   1156 	&mov	($rounds_,16);
   1157 	&sub	($rounds_,$rounds);
   1158 	&lea	($key,&DWP(32,$key,$rounds));
   1159 	&jmp	(&label("xts_enc_loop6"));
   1160 
   1161 &set_label("xts_enc_loop6",16);
        	# Tweak update, repeated per block: paddq doubles both 64-bit
        	# halves; pshufd 0x13 moves the top-bit flag of dword 3 to dword 0
        	# and that of dword 1 to dword 2, pand with 0x87,0,1,0 reduces them
        	# to the 0x87 residue and the carry into the upper half, and pxor
        	# folds both into the doubled value.  The first four tweaks are
        	# stored at 16*0..16*3.
   1162 	for ($i=0;$i<4;$i++) {
   1163 	    &pshufd	($twres,$twtmp,0x13);
   1164 	    &pxor	($twtmp,$twtmp);
   1165 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
   1166 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
   1167 	    &pand	($twres,$twmask);	# isolate carry and residue
   1168 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
   1169 	    &pxor	($tweak,$twres);
   1170 	}
        	# fifth tweak goes to 16*4; the sixth is built in $inout5 and saved
        	# at 16*5 further down ($i is 5 after the post-increment here)
   1171 	&pshufd	($inout5,$twtmp,0x13);
   1172 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
   1173 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1174 	 &$movekey	($rndkey0,&QWP(0,$key_));
   1175 	&pand	($inout5,$twmask);		# isolate carry and residue
   1176 	 &movups	($inout0,&QWP(0,$inp));	# load input
   1177 	&pxor	($inout5,$tweak);
   1178 
   1179 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
        	# ($rndkey1 aliases $tweak and briefly carries the sixth input block)
   1180 	&mov	($rounds,$rounds_);		# restore $rounds
   1181 	&movdqu	($inout1,&QWP(16*1,$inp));
   1182 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
   1183 	&movdqu	($inout2,&QWP(16*2,$inp));
   1184 	 &pxor		($inout1,$rndkey0);
   1185 	&movdqu	($inout3,&QWP(16*3,$inp));
   1186 	 &pxor		($inout2,$rndkey0);
   1187 	&movdqu	($inout4,&QWP(16*4,$inp));
   1188 	 &pxor		($inout3,$rndkey0);
   1189 	&movdqu	($rndkey1,&QWP(16*5,$inp));
   1190 	 &pxor		($inout4,$rndkey0);
   1191 	&lea	($inp,&DWP(16*6,$inp));
   1192 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1193 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
   1194 	&pxor	($inout5,$rndkey1);
   1195 
   1196 	 &$movekey	($rndkey1,&QWP(16,$key_));
   1197 	&pxor	($inout1,&QWP(16*1,"esp"));
   1198 	&pxor	($inout2,&QWP(16*2,"esp"));
   1199 	 &aesenc	($inout0,$rndkey1);
   1200 	&pxor	($inout3,&QWP(16*3,"esp"));
   1201 	&pxor	($inout4,&QWP(16*4,"esp"));
   1202 	 &aesenc	($inout1,$rndkey1);
   1203 	&pxor		($inout5,$rndkey0);
   1204 	 &$movekey	($rndkey0,&QWP(32,$key_));
   1205 	 &aesenc	($inout2,$rndkey1);
   1206 	 &aesenc	($inout3,$rndkey1);
   1207 	 &aesenc	($inout4,$rndkey1);
   1208 	 &aesenc	($inout5,$rndkey1);
   1209 	&call		(&label("_aesni_encrypt6_enter"));
   1210 
        	# xor the results with their tweaks and store them; the
        	# extra-indented instructions start deriving the next tweak from
        	# the sixth one
   1211 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
   1212        &pxor	($twtmp,$twtmp);
   1213 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1214        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
   1215 	&xorps	($inout1,&QWP(16*1,"esp"));
   1216 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1217 	&xorps	($inout2,&QWP(16*2,"esp"));
   1218 	&movups	(&QWP(16*1,$out),$inout1);
   1219 	&xorps	($inout3,&QWP(16*3,"esp"));
   1220 	&movups	(&QWP(16*2,$out),$inout2);
   1221 	&xorps	($inout4,&QWP(16*4,"esp"));
   1222 	&movups	(&QWP(16*3,$out),$inout3);
   1223 	&xorps	($inout5,$tweak);
   1224 	&movups	(&QWP(16*4,$out),$inout4);
   1225        &pshufd	($twres,$twtmp,0x13);
   1226 	&movups	(&QWP(16*5,$out),$inout5);
   1227 	&lea	($out,&DWP(16*6,$out));
   1228        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
   1229 
   1230 	&pxor	($twtmp,$twtmp);
   1231 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1232 	&pand	($twres,$twmask);		# isolate carry and residue
   1233 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1234 	&pxor	($tweak,$twres);
   1235 
   1236 	&sub	($len,16*6);
   1237 	&jnc	(&label("xts_enc_loop6"));
   1238 
   1239 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1240 	&mov	($key,$key_);			# restore $key
   1241 	&mov	($rounds_,$rounds);
   1242 
   1243 &set_label("xts_enc_short");
        	# $len was pre-decremented by 16*6; add it back to get the 0..5
        	# remaining whole blocks (as a byte count) and dispatch.  The SSE
        	# instructions between each cmp and its second branch do not touch
        	# the flags, so one cmp serves both jb and je.
   1244 	&add	($len,16*6);
   1245 	&jz	(&label("xts_enc_done6x"));
   1246 
   1247 	&movdqa	($inout3,$tweak);		# put aside previous tweak
   1248 	&cmp	($len,0x20);
   1249 	&jb	(&label("xts_enc_one"));
   1250 
   1251 	&pshufd	($twres,$twtmp,0x13);
   1252 	&pxor	($twtmp,$twtmp);
   1253 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1254 	&pand	($twres,$twmask);		# isolate carry and residue
   1255 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1256 	&pxor	($tweak,$twres);
   1257 	&je	(&label("xts_enc_two"));
   1258 
   1259 	&pshufd	($twres,$twtmp,0x13);
   1260 	&pxor	($twtmp,$twtmp);
   1261 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1262 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1263 	&pand	($twres,$twmask);		# isolate carry and residue
   1264 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1265 	&pxor	($tweak,$twres);
   1266 	&cmp	($len,0x40);
   1267 	&jb	(&label("xts_enc_three"));
   1268 
   1269 	&pshufd	($twres,$twtmp,0x13);
   1270 	&pxor	($twtmp,$twtmp);
   1271 	&movdqa	($inout5,$tweak);		# put aside previous tweak
   1272 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1273 	&pand	($twres,$twmask);		# isolate carry and residue
   1274 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1275 	&pxor	($tweak,$twres);
   1276 	&movdqa	(&QWP(16*0,"esp"),$inout3);
   1277 	&movdqa	(&QWP(16*1,"esp"),$inout4);
   1278 	&je	(&label("xts_enc_four"));
   1279 
        	# five blocks: spill tweaks 3 and 4, derive the fifth in $inout5
        	# and go through the 6x routine; only $inout0..$inout4 are stored
   1280 	&movdqa	(&QWP(16*2,"esp"),$inout5);
   1281 	&pshufd	($inout5,$twtmp,0x13);
   1282 	&movdqa	(&QWP(16*3,"esp"),$tweak);
   1283 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1284 	&pand	($inout5,$twmask);		# isolate carry and residue
   1285 	&pxor	($inout5,$tweak);
   1286 
   1287 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
   1288 	&movdqu	($inout1,&QWP(16*1,$inp));
   1289 	&movdqu	($inout2,&QWP(16*2,$inp));
   1290 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1291 	&movdqu	($inout3,&QWP(16*3,$inp));
   1292 	&pxor	($inout1,&QWP(16*1,"esp"));
   1293 	&movdqu	($inout4,&QWP(16*4,$inp));
   1294 	&pxor	($inout2,&QWP(16*2,"esp"));
   1295 	&lea	($inp,&DWP(16*5,$inp));
   1296 	&pxor	($inout3,&QWP(16*3,"esp"));
   1297 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
   1298 	&pxor	($inout4,$inout5);
   1299 
   1300 	&call	("_aesni_encrypt6");
   1301 
   1302 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
   1303 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1304 	&xorps	($inout1,&QWP(16*1,"esp"));
   1305 	&xorps	($inout2,&QWP(16*2,"esp"));
   1306 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1307 	&xorps	($inout3,&QWP(16*3,"esp"));
   1308 	&movups	(&QWP(16*1,$out),$inout1);
   1309 	&xorps	($inout4,$tweak);
   1310 	&movups	(&QWP(16*2,$out),$inout2);
   1311 	&movups	(&QWP(16*3,$out),$inout3);
   1312 	&movups	(&QWP(16*4,$out),$inout4);
   1313 	&lea	($out,&DWP(16*5,$out));
   1314 	&jmp	(&label("xts_enc_done"));
   1315 
   1316 &set_label("xts_enc_one",16);
   1317 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1318 	&lea	($inp,&DWP(16*1,$inp));
   1319 	&xorps	($inout0,$inout3);		# input^=tweak
   1320 	if ($inline)
   1321 	{   &aesni_inline_generate1("enc");	}
   1322 	else
   1323 	{   &call	("_aesni_encrypt1");	}
   1324 	&xorps	($inout0,$inout3);		# output^=tweak
   1325 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1326 	&lea	($out,&DWP(16*1,$out));
   1327 
   1328 	&movdqa	($tweak,$inout3);		# last tweak
   1329 	&jmp	(&label("xts_enc_done"));
   1330 
   1331 &set_label("xts_enc_two",16);
   1332 	&movaps	($inout4,$tweak);		# put aside last tweak
   1333 
   1334 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1335 	&movups	($inout1,&QWP(16*1,$inp));
   1336 	&lea	($inp,&DWP(16*2,$inp));
   1337 	&xorps	($inout0,$inout3);		# input^=tweak
   1338 	&xorps	($inout1,$inout4);
   1339 
   1340 	&call	("_aesni_encrypt2");
   1341 
   1342 	&xorps	($inout0,$inout3);		# output^=tweak
   1343 	&xorps	($inout1,$inout4);
   1344 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1345 	&movups	(&QWP(16*1,$out),$inout1);
   1346 	&lea	($out,&DWP(16*2,$out));
   1347 
   1348 	&movdqa	($tweak,$inout4);		# last tweak
   1349 	&jmp	(&label("xts_enc_done"));
   1350 
   1351 &set_label("xts_enc_three",16);
   1352 	&movaps	($inout5,$tweak);		# put aside last tweak
   1353 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1354 	&movups	($inout1,&QWP(16*1,$inp));
   1355 	&movups	($inout2,&QWP(16*2,$inp));
   1356 	&lea	($inp,&DWP(16*3,$inp));
   1357 	&xorps	($inout0,$inout3);		# input^=tweak
   1358 	&xorps	($inout1,$inout4);
   1359 	&xorps	($inout2,$inout5);
   1360 
   1361 	&call	("_aesni_encrypt3");
   1362 
   1363 	&xorps	($inout0,$inout3);		# output^=tweak
   1364 	&xorps	($inout1,$inout4);
   1365 	&xorps	($inout2,$inout5);
   1366 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1367 	&movups	(&QWP(16*1,$out),$inout1);
   1368 	&movups	(&QWP(16*2,$out),$inout2);
   1369 	&lea	($out,&DWP(16*3,$out));
   1370 
   1371 	&movdqa	($tweak,$inout5);		# last tweak
   1372 	&jmp	(&label("xts_enc_done"));
   1373 
   1374 &set_label("xts_enc_four",16);
        	# tweaks 1 and 2 were spilled to 16*0/16*1 before the branch, since
        	# $inout3 and $inout4 are needed for data (and the fourth tweak)
   1375 	&movaps	($inout4,$tweak);		# put aside last tweak
   1376 
   1377 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1378 	&movups	($inout1,&QWP(16*1,$inp));
   1379 	&movups	($inout2,&QWP(16*2,$inp));
   1380 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1381 	&movups	($inout3,&QWP(16*3,$inp));
   1382 	&lea	($inp,&DWP(16*4,$inp));
   1383 	&xorps	($inout1,&QWP(16*1,"esp"));
   1384 	&xorps	($inout2,$inout5);
   1385 	&xorps	($inout3,$inout4);
   1386 
   1387 	&call	("_aesni_encrypt4");
   1388 
   1389 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1390 	&xorps	($inout1,&QWP(16*1,"esp"));
   1391 	&xorps	($inout2,$inout5);
   1392 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1393 	&xorps	($inout3,$inout4);
   1394 	&movups	(&QWP(16*1,$out),$inout1);
   1395 	&movups	(&QWP(16*2,$out),$inout2);
   1396 	&movups	(&QWP(16*3,$out),$inout3);
   1397 	&lea	($out,&DWP(16*4,$out));
   1398 
   1399 	&movdqa	($tweak,$inout4);		# last tweak
   1400 	&jmp	(&label("xts_enc_done"));
   1401 
   1402 &set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
        	# no whole blocks were left over, so $tweak already is the tweak
        	# for the next block and can be used for stealing as-is
   1403 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1404 	&and	($len,15);
   1405 	&jz	(&label("xts_enc_ret"));
   1406 	&movdqa	($inout3,$tweak);
   1407 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1408 	&jmp	(&label("xts_enc_steal"));
   1409 
   1410 &set_label("xts_enc_done",16);
        	# $tweak is the tweak of the last block just written; if a partial
        	# block follows, advance it once more into $inout3
   1411 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1412 	&pxor	($twtmp,$twtmp);
   1413 	&and	($len,15);
   1414 	&jz	(&label("xts_enc_ret"));
   1415 
   1416 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1417 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1418 	&pshufd	($inout3,$twtmp,0x13);
   1419 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1420 	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
   1421 	&pxor	($inout3,$tweak);
   1422 
   1423 &set_label("xts_enc_steal");
        	# byte-wise swap, len%16 times: the next input byte replaces a byte
        	# of the last full output block (at $out-16) and the displaced byte
        	# becomes the trailing short output.  $rounds and $key serve as
        	# scratch here and are restored from their backups afterwards.
   1424 	&movz	($rounds,&BP(0,$inp));
   1425 	&movz	($key,&BP(-16,$out));
   1426 	&lea	($inp,&DWP(1,$inp));
   1427 	&mov	(&BP(-16,$out),&LB($rounds));
   1428 	&mov	(&BP(0,$out),&LB($key));
   1429 	&lea	($out,&DWP(1,$out));
   1430 	&sub	($len,1);
   1431 	&jnz	(&label("xts_enc_steal"));
   1432 
   1433 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
   1434 	&mov	($key,$key_);			# restore $key
   1435 	&mov	($rounds,$rounds_);		# restore $rounds
   1436 
        	# encrypt the patched block in place with the tweak kept in $inout3
   1437 	&movups	($inout0,&QWP(-16,$out));	# load input
   1438 	&xorps	($inout0,$inout3);		# input^=tweak
   1439 	if ($inline)
   1440 	{   &aesni_inline_generate1("enc");	}
   1441 	else
   1442 	{   &call	("_aesni_encrypt1");	}
   1443 	&xorps	($inout0,$inout3);		# output^=tweak
   1444 	&movups	(&QWP(-16,$out),$inout0);	# write output
   1445 
   1446 &set_label("xts_enc_ret");
        	# common exit: zero all xmm registers and the six tweak slots, then
        	# restore the caller's %esp
   1447 	&pxor	("xmm0","xmm0");		# clear register bank
   1448 	&pxor	("xmm1","xmm1");
   1449 	&pxor	("xmm2","xmm2");
   1450 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
   1451 	&pxor	("xmm3","xmm3");
   1452 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
   1453 	&pxor	("xmm4","xmm4");
   1454 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
   1455 	&pxor	("xmm5","xmm5");
   1456 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
   1457 	&pxor	("xmm6","xmm6");
   1458 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
   1459 	&pxor	("xmm7","xmm7");
   1460 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
   1461 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
   1462 &function_end("aesni_xts_encrypt");
   1463 
   1464 &function_begin("aesni_xts_decrypt");
   1465 	&mov	($key,&wparam(4));		# key2
   1466 	&mov	($inp,&wparam(5));		# clear-text tweak
   1467 
   1468 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
   1469 	&movups	($inout0,&QWP(0,$inp));
   1470 	if ($inline)
   1471 	{   &aesni_inline_generate1("enc");	}
   1472 	else
   1473 	{   &call	("_aesni_encrypt1");	}
   1474 
   1475 	&mov	($inp,&wparam(0));
   1476 	&mov	($out,&wparam(1));
   1477 	&mov	($len,&wparam(2));
   1478 	&mov	($key,&wparam(3));		# key1
   1479 
   1480 	&mov	($key_,"esp");
   1481 	&sub	("esp",16*7+8);
   1482 	&and	("esp",-16);			# align stack
   1483 
   1484 	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
   1485 	&test	($len,15);
   1486 	&setnz	(&LB($rounds_));
   1487 	&shl	($rounds_,4);
   1488 	&sub	($len,$rounds_);
   1489 
   1490 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
   1491 	&mov	(&DWP(16*6+4,"esp"),0);
   1492 	&mov	(&DWP(16*6+8,"esp"),1);
   1493 	&mov	(&DWP(16*6+12,"esp"),0);
   1494 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
   1495 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
   1496 
   1497 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
   1498 	&mov	($key_,$key);			# backup $key
   1499 	&mov	($rounds_,$rounds);		# backup $rounds
   1500 
   1501 	&movdqa	($tweak,$inout0);
   1502 	&pxor	($twtmp,$twtmp);
   1503 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
   1504 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1505 
   1506 	&and	($len,-16);
   1507 	&sub	($len,16*6);
   1508 	&jc	(&label("xts_dec_short"));
   1509 
   1510 	&shl	($rounds,4);
   1511 	&mov	($rounds_,16);
   1512 	&sub	($rounds_,$rounds);
   1513 	&lea	($key,&DWP(32,$key,$rounds));
   1514 	&jmp	(&label("xts_dec_loop6"));
   1515 
   1516 &set_label("xts_dec_loop6",16);
   1517 	for ($i=0;$i<4;$i++) {
   1518 	    &pshufd	($twres,$twtmp,0x13);
   1519 	    &pxor	($twtmp,$twtmp);
   1520 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
   1521 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
   1522 	    &pand	($twres,$twmask);	# isolate carry and residue
   1523 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
   1524 	    &pxor	($tweak,$twres);
   1525 	}
   1526 	&pshufd	($inout5,$twtmp,0x13);
   1527 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
   1528 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1529 	 &$movekey	($rndkey0,&QWP(0,$key_));
   1530 	&pand	($inout5,$twmask);		# isolate carry and residue
   1531 	 &movups	($inout0,&QWP(0,$inp));	# load input
   1532 	&pxor	($inout5,$tweak);
   1533 
   1534 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
   1535 	&mov	($rounds,$rounds_);
   1536 	&movdqu	($inout1,&QWP(16*1,$inp));
   1537 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
   1538 	&movdqu	($inout2,&QWP(16*2,$inp));
   1539 	 &pxor		($inout1,$rndkey0);
   1540 	&movdqu	($inout3,&QWP(16*3,$inp));
   1541 	 &pxor		($inout2,$rndkey0);
   1542 	&movdqu	($inout4,&QWP(16*4,$inp));
   1543 	 &pxor		($inout3,$rndkey0);
   1544 	&movdqu	($rndkey1,&QWP(16*5,$inp));
   1545 	 &pxor		($inout4,$rndkey0);
   1546 	&lea	($inp,&DWP(16*6,$inp));
   1547 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1548 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
   1549 	&pxor	($inout5,$rndkey1);
   1550 
   1551 	 &$movekey	($rndkey1,&QWP(16,$key_));
   1552 	&pxor	($inout1,&QWP(16*1,"esp"));
   1553 	&pxor	($inout2,&QWP(16*2,"esp"));
   1554 	 &aesdec	($inout0,$rndkey1);
   1555 	&pxor	($inout3,&QWP(16*3,"esp"));
   1556 	&pxor	($inout4,&QWP(16*4,"esp"));
   1557 	 &aesdec	($inout1,$rndkey1);
   1558 	&pxor		($inout5,$rndkey0);
   1559 	 &$movekey	($rndkey0,&QWP(32,$key_));
   1560 	 &aesdec	($inout2,$rndkey1);
   1561 	 &aesdec	($inout3,$rndkey1);
   1562 	 &aesdec	($inout4,$rndkey1);
   1563 	 &aesdec	($inout5,$rndkey1);
   1564 	&call		(&label("_aesni_decrypt6_enter"));
   1565 
   1566 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
   1567        &pxor	($twtmp,$twtmp);
   1568 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1569        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
   1570 	&xorps	($inout1,&QWP(16*1,"esp"));
   1571 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1572 	&xorps	($inout2,&QWP(16*2,"esp"));
   1573 	&movups	(&QWP(16*1,$out),$inout1);
   1574 	&xorps	($inout3,&QWP(16*3,"esp"));
   1575 	&movups	(&QWP(16*2,$out),$inout2);
   1576 	&xorps	($inout4,&QWP(16*4,"esp"));
   1577 	&movups	(&QWP(16*3,$out),$inout3);
   1578 	&xorps	($inout5,$tweak);
   1579 	&movups	(&QWP(16*4,$out),$inout4);
   1580        &pshufd	($twres,$twtmp,0x13);
   1581 	&movups	(&QWP(16*5,$out),$inout5);
   1582 	&lea	($out,&DWP(16*6,$out));
   1583        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
   1584 
   1585 	&pxor	($twtmp,$twtmp);
   1586 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1587 	&pand	($twres,$twmask);		# isolate carry and residue
   1588 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1589 	&pxor	($tweak,$twres);
   1590 
   1591 	&sub	($len,16*6);
   1592 	&jnc	(&label("xts_dec_loop6"));
   1593 
   1594 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1595 	&mov	($key,$key_);			# restore $key
   1596 	&mov	($rounds_,$rounds);
   1597 
   1598 &set_label("xts_dec_short");
   1599 	&add	($len,16*6);
   1600 	&jz	(&label("xts_dec_done6x"));
   1601 
   1602 	&movdqa	($inout3,$tweak);		# put aside previous tweak
   1603 	&cmp	($len,0x20);
   1604 	&jb	(&label("xts_dec_one"));
   1605 
   1606 	&pshufd	($twres,$twtmp,0x13);
   1607 	&pxor	($twtmp,$twtmp);
   1608 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1609 	&pand	($twres,$twmask);		# isolate carry and residue
   1610 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1611 	&pxor	($tweak,$twres);
   1612 	&je	(&label("xts_dec_two"));
   1613 
   1614 	&pshufd	($twres,$twtmp,0x13);
   1615 	&pxor	($twtmp,$twtmp);
   1616 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1617 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1618 	&pand	($twres,$twmask);		# isolate carry and residue
   1619 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1620 	&pxor	($tweak,$twres);
   1621 	&cmp	($len,0x40);
   1622 	&jb	(&label("xts_dec_three"));
   1623 
   1624 	&pshufd	($twres,$twtmp,0x13);
   1625 	&pxor	($twtmp,$twtmp);
   1626 	&movdqa	($inout5,$tweak);		# put aside previous tweak
   1627 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1628 	&pand	($twres,$twmask);		# isolate carry and residue
   1629 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1630 	&pxor	($tweak,$twres);
   1631 	&movdqa	(&QWP(16*0,"esp"),$inout3);
   1632 	&movdqa	(&QWP(16*1,"esp"),$inout4);
   1633 	&je	(&label("xts_dec_four"));
   1634 
   1635 	&movdqa	(&QWP(16*2,"esp"),$inout5);
   1636 	&pshufd	($inout5,$twtmp,0x13);
   1637 	&movdqa	(&QWP(16*3,"esp"),$tweak);
   1638 	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
   1639 	&pand	($inout5,$twmask);		# isolate carry and residue
   1640 	&pxor	($inout5,$tweak);
   1641 
   1642 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
   1643 	&movdqu	($inout1,&QWP(16*1,$inp));
   1644 	&movdqu	($inout2,&QWP(16*2,$inp));
   1645 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1646 	&movdqu	($inout3,&QWP(16*3,$inp));
   1647 	&pxor	($inout1,&QWP(16*1,"esp"));
   1648 	&movdqu	($inout4,&QWP(16*4,$inp));
   1649 	&pxor	($inout2,&QWP(16*2,"esp"));
   1650 	&lea	($inp,&DWP(16*5,$inp));
   1651 	&pxor	($inout3,&QWP(16*3,"esp"));
   1652 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
   1653 	&pxor	($inout4,$inout5);
   1654 
   1655 	&call	("_aesni_decrypt6");
   1656 
   1657 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
   1658 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1659 	&xorps	($inout1,&QWP(16*1,"esp"));
   1660 	&xorps	($inout2,&QWP(16*2,"esp"));
   1661 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1662 	&xorps	($inout3,&QWP(16*3,"esp"));
   1663 	&movups	(&QWP(16*1,$out),$inout1);
   1664 	&xorps	($inout4,$tweak);
   1665 	&movups	(&QWP(16*2,$out),$inout2);
   1666 	&movups	(&QWP(16*3,$out),$inout3);
   1667 	&movups	(&QWP(16*4,$out),$inout4);
   1668 	&lea	($out,&DWP(16*5,$out));
   1669 	&jmp	(&label("xts_dec_done"));
   1670 
   1671 &set_label("xts_dec_one",16);
   1672 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1673 	&lea	($inp,&DWP(16*1,$inp));
   1674 	&xorps	($inout0,$inout3);		# input^=tweak
   1675 	if ($inline)
   1676 	{   &aesni_inline_generate1("dec");	}
   1677 	else
   1678 	{   &call	("_aesni_decrypt1");	}
   1679 	&xorps	($inout0,$inout3);		# output^=tweak
   1680 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1681 	&lea	($out,&DWP(16*1,$out));
   1682 
   1683 	&movdqa	($tweak,$inout3);		# last tweak
   1684 	&jmp	(&label("xts_dec_done"));
   1685 
   1686 &set_label("xts_dec_two",16);
   1687 	&movaps	($inout4,$tweak);		# put aside last tweak
   1688 
   1689 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1690 	&movups	($inout1,&QWP(16*1,$inp));
   1691 	&lea	($inp,&DWP(16*2,$inp));
   1692 	&xorps	($inout0,$inout3);		# input^=tweak
   1693 	&xorps	($inout1,$inout4);
   1694 
   1695 	&call	("_aesni_decrypt2");
   1696 
   1697 	&xorps	($inout0,$inout3);		# output^=tweak
   1698 	&xorps	($inout1,$inout4);
   1699 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1700 	&movups	(&QWP(16*1,$out),$inout1);
   1701 	&lea	($out,&DWP(16*2,$out));
   1702 
   1703 	&movdqa	($tweak,$inout4);		# last tweak
   1704 	&jmp	(&label("xts_dec_done"));
   1705 
   1706 &set_label("xts_dec_three",16);
   1707 	&movaps	($inout5,$tweak);		# put aside last tweak
   1708 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1709 	&movups	($inout1,&QWP(16*1,$inp));
   1710 	&movups	($inout2,&QWP(16*2,$inp));
   1711 	&lea	($inp,&DWP(16*3,$inp));
   1712 	&xorps	($inout0,$inout3);		# input^=tweak
   1713 	&xorps	($inout1,$inout4);
   1714 	&xorps	($inout2,$inout5);
   1715 
   1716 	&call	("_aesni_decrypt3");
   1717 
   1718 	&xorps	($inout0,$inout3);		# output^=tweak
   1719 	&xorps	($inout1,$inout4);
   1720 	&xorps	($inout2,$inout5);
   1721 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1722 	&movups	(&QWP(16*1,$out),$inout1);
   1723 	&movups	(&QWP(16*2,$out),$inout2);
   1724 	&lea	($out,&DWP(16*3,$out));
   1725 
   1726 	&movdqa	($tweak,$inout5);		# last tweak
   1727 	&jmp	(&label("xts_dec_done"));
   1728 
   1729 &set_label("xts_dec_four",16);
   1730 	&movaps	($inout4,$tweak);		# put aside last tweak
   1731 
   1732 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1733 	&movups	($inout1,&QWP(16*1,$inp));
   1734 	&movups	($inout2,&QWP(16*2,$inp));
   1735 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1736 	&movups	($inout3,&QWP(16*3,$inp));
   1737 	&lea	($inp,&DWP(16*4,$inp));
   1738 	&xorps	($inout1,&QWP(16*1,"esp"));
   1739 	&xorps	($inout2,$inout5);
   1740 	&xorps	($inout3,$inout4);
   1741 
   1742 	&call	("_aesni_decrypt4");
   1743 
   1744 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1745 	&xorps	($inout1,&QWP(16*1,"esp"));
   1746 	&xorps	($inout2,$inout5);
   1747 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1748 	&xorps	($inout3,$inout4);
   1749 	&movups	(&QWP(16*1,$out),$inout1);
   1750 	&movups	(&QWP(16*2,$out),$inout2);
   1751 	&movups	(&QWP(16*3,$out),$inout3);
   1752 	&lea	($out,&DWP(16*4,$out));
   1753 
   1754 	&movdqa	($tweak,$inout4);		# last tweak
   1755 	&jmp	(&label("xts_dec_done"));
   1756 
   1757 &set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
   1758 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1759 	&and	($len,15);
   1760 	&jz	(&label("xts_dec_ret"));
   1761 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1762 	&jmp	(&label("xts_dec_only_one_more"));
   1763 
   1764 &set_label("xts_dec_done",16);
   1765 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1766 	&pxor	($twtmp,$twtmp);
   1767 	&and	($len,15);
   1768 	&jz	(&label("xts_dec_ret"));
   1769 
   1770 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1771 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1772 	&pshufd	($twres,$twtmp,0x13);
   1773 	&pxor	($twtmp,$twtmp);
   1774 	&movdqa	($twmask,&QWP(16*6,"esp"));
   1775 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1776 	&pand	($twres,$twmask);		# isolate carry and residue
   1777 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1778 	&pxor	($tweak,$twres);
   1779 
   1780 &set_label("xts_dec_only_one_more");
   1781 	&pshufd	($inout3,$twtmp,0x13);
   1782 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1783 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1784 	&pand	($inout3,$twmask);		# isolate carry and residue
   1785 	&pxor	($inout3,$tweak);
   1786 
   1787 	&mov	($key,$key_);			# restore $key
   1788 	&mov	($rounds,$rounds_);		# restore $rounds
   1789 
   1790 	&movups	($inout0,&QWP(0,$inp));		# load input
   1791 	&xorps	($inout0,$inout3);		# input^=tweak
   1792 	if ($inline)
   1793 	{   &aesni_inline_generate1("dec");	}
   1794 	else
   1795 	{   &call	("_aesni_decrypt1");	}
   1796 	&xorps	($inout0,$inout3);		# output^=tweak
   1797 	&movups	(&QWP(0,$out),$inout0);		# write output
   1798 
   1799 &set_label("xts_dec_steal");
   1800 	&movz	($rounds,&BP(16,$inp));
   1801 	&movz	($key,&BP(0,$out));
   1802 	&lea	($inp,&DWP(1,$inp));
   1803 	&mov	(&BP(0,$out),&LB($rounds));
   1804 	&mov	(&BP(16,$out),&LB($key));
   1805 	&lea	($out,&DWP(1,$out));
   1806 	&sub	($len,1);
   1807 	&jnz	(&label("xts_dec_steal"));
   1808 
   1809 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
   1810 	&mov	($key,$key_);			# restore $key
   1811 	&mov	($rounds,$rounds_);		# restore $rounds
   1812 
   1813 	&movups	($inout0,&QWP(0,$out));		# load input
   1814 	&xorps	($inout0,$inout4);		# input^=tweak
   1815 	if ($inline)
   1816 	{   &aesni_inline_generate1("dec");	}
   1817 	else
   1818 	{   &call	("_aesni_decrypt1");	}
   1819 	&xorps	($inout0,$inout4);		# output^=tweak
   1820 	&movups	(&QWP(0,$out),$inout0);		# write output
   1821 
   1822 &set_label("xts_dec_ret");
   1823 	&pxor	("xmm0","xmm0");		# clear register bank
   1824 	&pxor	("xmm1","xmm1");
   1825 	&pxor	("xmm2","xmm2");
   1826 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
   1827 	&pxor	("xmm3","xmm3");
   1828 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
   1829 	&pxor	("xmm4","xmm4");
   1830 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
   1831 	&pxor	("xmm5","xmm5");
   1832 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
   1833 	&pxor	("xmm6","xmm6");
   1834 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
   1835 	&pxor	("xmm7","xmm7");
   1836 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
   1837 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
   1838 &function_end("aesni_xts_decrypt");
   1839 }
   1840 }
   1841 
   1843 ######################################################################
   1844 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
   1845 #                           size_t length, const AES_KEY *key,
   1846 #                           unsigned char *ivp,const int enc);
   1847 &function_begin("${PREFIX}_cbc_encrypt");
   1848 	&mov	($inp,&wparam(0));
   1849 	&mov	($rounds_,"esp");
   1850 	&mov	($out,&wparam(1));
   1851 	&sub	($rounds_,24);
   1852 	&mov	($len,&wparam(2));
   1853 	&and	($rounds_,-16);
   1854 	&mov	($key,&wparam(3));
   1855 	&mov	($key_,&wparam(4));
   1856 	&test	($len,$len);
   1857 	&jz	(&label("cbc_abort"));
   1858 
   1859 	&cmp	(&wparam(5),0);
   1860 	&xchg	($rounds_,"esp");		# alloca
   1861 	&movups	($ivec,&QWP(0,$key_));		# load IV
   1862 	&mov	($rounds,&DWP(240,$key));
   1863 	&mov	($key_,$key);			# backup $key
   1864 	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
   1865 	&mov	($rounds_,$rounds);		# backup $rounds
   1866 	&je	(&label("cbc_decrypt"));
   1867 
   1868 	&movaps	($inout0,$ivec);
   1869 	&cmp	($len,16);
   1870 	&jb	(&label("cbc_enc_tail"));
   1871 	&sub	($len,16);
   1872 	&jmp	(&label("cbc_enc_loop"));
   1873 
   1874 &set_label("cbc_enc_loop",16);
   1875 	&movups	($ivec,&QWP(0,$inp));		# input actually
   1876 	&lea	($inp,&DWP(16,$inp));
   1877 	if ($inline)
   1878 	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
   1879 	else
   1880 	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
   1881 	&mov	($rounds,$rounds_);	# restore $rounds
   1882 	&mov	($key,$key_);		# restore $key
   1883 	&movups	(&QWP(0,$out),$inout0);	# store output
   1884 	&lea	($out,&DWP(16,$out));
   1885 	&sub	($len,16);
   1886 	&jnc	(&label("cbc_enc_loop"));
   1887 	&add	($len,16);
   1888 	&jnz	(&label("cbc_enc_tail"));
   1889 	&movaps	($ivec,$inout0);
   1890 	&pxor	($inout0,$inout0);
   1891 	&jmp	(&label("cbc_ret"));
   1892 
   1893 &set_label("cbc_enc_tail");
   1894 	&mov	("ecx",$len);		# zaps $rounds
   1895 	&data_word(0xA4F3F689);		# rep movsb
   1896 	&mov	("ecx",16);		# zero tail
   1897 	&sub	("ecx",$len);
   1898 	&xor	("eax","eax");		# zaps $len
   1899 	&data_word(0xAAF3F689);		# rep stosb
   1900 	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
   1901 	&mov	($rounds,$rounds_);	# restore $rounds
   1902 	&mov	($inp,$out);		# $inp and $out are the same
   1903 	&mov	($key,$key_);		# restore $key
   1904 	&jmp	(&label("cbc_enc_loop"));
   1905 ######################################################################
   1906 &set_label("cbc_decrypt",16);
   1907 	&cmp	($len,0x50);
   1908 	&jbe	(&label("cbc_dec_tail"));
   1909 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   1910 	&sub	($len,0x50);
   1911 	&jmp	(&label("cbc_dec_loop6_enter"));
   1912 
   1913 &set_label("cbc_dec_loop6",16);
   1914 	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
   1915 	&movups	(&QWP(0,$out),$inout5);
   1916 	&lea	($out,&DWP(0x10,$out));
   1917 &set_label("cbc_dec_loop6_enter");
   1918 	&movdqu	($inout0,&QWP(0,$inp));
   1919 	&movdqu	($inout1,&QWP(0x10,$inp));
   1920 	&movdqu	($inout2,&QWP(0x20,$inp));
   1921 	&movdqu	($inout3,&QWP(0x30,$inp));
   1922 	&movdqu	($inout4,&QWP(0x40,$inp));
   1923 	&movdqu	($inout5,&QWP(0x50,$inp));
   1924 
   1925 	&call	("_aesni_decrypt6");
   1926 
   1927 	&movups	($rndkey1,&QWP(0,$inp));
   1928 	&movups	($rndkey0,&QWP(0x10,$inp));
   1929 	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
   1930 	&xorps	($inout1,$rndkey1);
   1931 	&movups	($rndkey1,&QWP(0x20,$inp));
   1932 	&xorps	($inout2,$rndkey0);
   1933 	&movups	($rndkey0,&QWP(0x30,$inp));
   1934 	&xorps	($inout3,$rndkey1);
   1935 	&movups	($rndkey1,&QWP(0x40,$inp));
   1936 	&xorps	($inout4,$rndkey0);
   1937 	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
   1938 	&xorps	($inout5,$rndkey1);
   1939 	&movups	(&QWP(0,$out),$inout0);
   1940 	&movups	(&QWP(0x10,$out),$inout1);
   1941 	&lea	($inp,&DWP(0x60,$inp));
   1942 	&movups	(&QWP(0x20,$out),$inout2);
   1943 	&mov	($rounds,$rounds_);		# restore $rounds
   1944 	&movups	(&QWP(0x30,$out),$inout3);
   1945 	&mov	($key,$key_);			# restore $key
   1946 	&movups	(&QWP(0x40,$out),$inout4);
   1947 	&lea	($out,&DWP(0x50,$out));
   1948 	&sub	($len,0x60);
   1949 	&ja	(&label("cbc_dec_loop6"));
   1950 
   1951 	&movaps	($inout0,$inout5);
   1952 	&movaps	($ivec,$rndkey0);
   1953 	&add	($len,0x50);
   1954 	&jle	(&label("cbc_dec_clear_tail_collected"));
   1955 	&movups	(&QWP(0,$out),$inout0);
   1956 	&lea	($out,&DWP(0x10,$out));
   1957 &set_label("cbc_dec_tail");
   1958 	&movups	($inout0,&QWP(0,$inp));
   1959 	&movaps	($in0,$inout0);
   1960 	&cmp	($len,0x10);
   1961 	&jbe	(&label("cbc_dec_one"));
   1962 
   1963 	&movups	($inout1,&QWP(0x10,$inp));
   1964 	&movaps	($in1,$inout1);
   1965 	&cmp	($len,0x20);
   1966 	&jbe	(&label("cbc_dec_two"));
   1967 
   1968 	&movups	($inout2,&QWP(0x20,$inp));
   1969 	&cmp	($len,0x30);
   1970 	&jbe	(&label("cbc_dec_three"));
   1971 
   1972 	&movups	($inout3,&QWP(0x30,$inp));
   1973 	&cmp	($len,0x40);
   1974 	&jbe	(&label("cbc_dec_four"));
   1975 
   1976 	&movups	($inout4,&QWP(0x40,$inp));
   1977 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   1978 	&movups	($inout0,&QWP(0,$inp));
   1979 	&xorps	($inout5,$inout5);
   1980 	&call	("_aesni_decrypt6");
   1981 	&movups	($rndkey1,&QWP(0,$inp));
   1982 	&movups	($rndkey0,&QWP(0x10,$inp));
   1983 	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
   1984 	&xorps	($inout1,$rndkey1);
   1985 	&movups	($rndkey1,&QWP(0x20,$inp));
   1986 	&xorps	($inout2,$rndkey0);
   1987 	&movups	($rndkey0,&QWP(0x30,$inp));
   1988 	&xorps	($inout3,$rndkey1);
   1989 	&movups	($ivec,&QWP(0x40,$inp));	# IV
   1990 	&xorps	($inout4,$rndkey0);
   1991 	&movups	(&QWP(0,$out),$inout0);
   1992 	&movups	(&QWP(0x10,$out),$inout1);
   1993 	&pxor	($inout1,$inout1);
   1994 	&movups	(&QWP(0x20,$out),$inout2);
   1995 	&pxor	($inout2,$inout2);
   1996 	&movups	(&QWP(0x30,$out),$inout3);
   1997 	&pxor	($inout3,$inout3);
   1998 	&lea	($out,&DWP(0x40,$out));
   1999 	&movaps	($inout0,$inout4);
   2000 	&pxor	($inout4,$inout4);
   2001 	&sub	($len,0x50);
   2002 	&jmp	(&label("cbc_dec_tail_collected"));
   2003 
   2004 &set_label("cbc_dec_one",16);
   2005 	if ($inline)
   2006 	{   &aesni_inline_generate1("dec");	}
   2007 	else
   2008 	{   &call	("_aesni_decrypt1");	}
   2009 	&xorps	($inout0,$ivec);
   2010 	&movaps	($ivec,$in0);
   2011 	&sub	($len,0x10);
   2012 	&jmp	(&label("cbc_dec_tail_collected"));
   2013 
   2014 &set_label("cbc_dec_two",16);
   2015 	&call	("_aesni_decrypt2");
   2016 	&xorps	($inout0,$ivec);
   2017 	&xorps	($inout1,$in0);
   2018 	&movups	(&QWP(0,$out),$inout0);
   2019 	&movaps	($inout0,$inout1);
   2020 	&pxor	($inout1,$inout1);
   2021 	&lea	($out,&DWP(0x10,$out));
   2022 	&movaps	($ivec,$in1);
   2023 	&sub	($len,0x20);
   2024 	&jmp	(&label("cbc_dec_tail_collected"));
   2025 
   2026 &set_label("cbc_dec_three",16);
   2027 	&call	("_aesni_decrypt3");
   2028 	&xorps	($inout0,$ivec);
   2029 	&xorps	($inout1,$in0);
   2030 	&xorps	($inout2,$in1);
   2031 	&movups	(&QWP(0,$out),$inout0);
   2032 	&movaps	($inout0,$inout2);
   2033 	&pxor	($inout2,$inout2);
   2034 	&movups	(&QWP(0x10,$out),$inout1);
   2035 	&pxor	($inout1,$inout1);
   2036 	&lea	($out,&DWP(0x20,$out));
   2037 	&movups	($ivec,&QWP(0x20,$inp));
   2038 	&sub	($len,0x30);
   2039 	&jmp	(&label("cbc_dec_tail_collected"));
   2040 
   2041 &set_label("cbc_dec_four",16);
   2042 	&call	("_aesni_decrypt4");
   2043 	&movups	($rndkey1,&QWP(0x10,$inp));
   2044 	&movups	($rndkey0,&QWP(0x20,$inp));
   2045 	&xorps	($inout0,$ivec);
   2046 	&movups	($ivec,&QWP(0x30,$inp));
   2047 	&xorps	($inout1,$in0);
   2048 	&movups	(&QWP(0,$out),$inout0);
   2049 	&xorps	($inout2,$rndkey1);
   2050 	&movups	(&QWP(0x10,$out),$inout1);
   2051 	&pxor	($inout1,$inout1);
   2052 	&xorps	($inout3,$rndkey0);
   2053 	&movups	(&QWP(0x20,$out),$inout2);
   2054 	&pxor	($inout2,$inout2);
   2055 	&lea	($out,&DWP(0x30,$out));
   2056 	&movaps	($inout0,$inout3);
   2057 	&pxor	($inout3,$inout3);
   2058 	&sub	($len,0x40);
   2059 	&jmp	(&label("cbc_dec_tail_collected"));
   2060 
   2061 &set_label("cbc_dec_clear_tail_collected",16);
   2062 	&pxor	($inout1,$inout1);
   2063 	&pxor	($inout2,$inout2);
   2064 	&pxor	($inout3,$inout3);
   2065 	&pxor	($inout4,$inout4);
   2066 &set_label("cbc_dec_tail_collected");
   2067 	&and	($len,15);
   2068 	&jnz	(&label("cbc_dec_tail_partial"));
   2069 	&movups	(&QWP(0,$out),$inout0);
   2070 	&pxor	($rndkey0,$rndkey0);
   2071 	&jmp	(&label("cbc_ret"));
   2072 
   2073 &set_label("cbc_dec_tail_partial",16);
   2074 	&movaps	(&QWP(0,"esp"),$inout0);
   2075 	&pxor	($rndkey0,$rndkey0);
   2076 	&mov	("ecx",16);
   2077 	&mov	($inp,"esp");
   2078 	&sub	("ecx",$len);
   2079 	&data_word(0xA4F3F689);		# rep movsb
   2080 	&movdqa	(&QWP(0,"esp"),$inout0);
   2081 
   2082 &set_label("cbc_ret");
   2083 	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
   2084 	&mov	($key_,&wparam(4));
   2085 	&pxor	($inout0,$inout0);
   2086 	&pxor	($rndkey1,$rndkey1);
   2087 	&movups	(&QWP(0,$key_),$ivec);	# output IV
   2088 	&pxor	($ivec,$ivec);
   2089 &set_label("cbc_abort");
   2090 &function_end("${PREFIX}_cbc_encrypt");
   2091 
   2093 ######################################################################
   2094 # Mechanical port from aesni-x86_64.pl.
   2095 #
   2096 # _aesni_set_encrypt_key is private interface,
   2097 # input:
   2098 #	"eax"	const unsigned char *userKey
   2099 #	$rounds	int bits
   2100 #	$key	AES_KEY *key
   2101 # output:
   2102 #	"eax"	return code
   2103 #	$round	rounds
   2104 
   2105 &function_begin_B("_aesni_set_encrypt_key");
   2106 	&push	("ebp");
   2107 	&push	("ebx");
   2108 	&test	("eax","eax");
   2109 	&jz	(&label("bad_pointer"));
   2110 	&test	($key,$key);
   2111 	&jz	(&label("bad_pointer"));
   2112 
   2113 	&call	(&label("pic"));
   2114 &set_label("pic");
   2115 	&blindpop("ebx");
   2116 	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
   2117 
   2118 	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
   2119 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
   2120 	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
   2121 	&mov	("ebp",&DWP(4,"ebp"));
   2122 	&lea	($key,&DWP(16,$key));
   2123 	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
   2124 	&cmp	($rounds,256);
   2125 	&je	(&label("14rounds"));
   2126 	&cmp	($rounds,192);
   2127 	&je	(&label("12rounds"));
   2128 	&cmp	($rounds,128);
   2129 	&jne	(&label("bad_keybits"));
   2130 
   2131 &set_label("10rounds",16);
   2132 	&cmp		("ebp",1<<28);
   2133 	&je		(&label("10rounds_alt"));
   2134 
   2135 	&mov		($rounds,9);
   2136 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2137 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
   2138 	&call		(&label("key_128_cold"));
   2139 	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
   2140 	&call		(&label("key_128"));
   2141 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
   2142 	&call		(&label("key_128"));
   2143 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
   2144 	&call		(&label("key_128"));
   2145 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
   2146 	&call		(&label("key_128"));
   2147 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
   2148 	&call		(&label("key_128"));
   2149 	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
   2150 	&call		(&label("key_128"));
   2151 	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
   2152 	&call		(&label("key_128"));
   2153 	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
   2154 	&call		(&label("key_128"));
   2155 	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
   2156 	&call		(&label("key_128"));
   2157 	&$movekey	(&QWP(0,$key),"xmm0");
   2158 	&mov		(&DWP(80,$key),$rounds);
   2159 
   2160 	&jmp	(&label("good_key"));
   2161 
   2162 &set_label("key_128",16);
   2163 	&$movekey	(&QWP(0,$key),"xmm0");
   2164 	&lea		($key,&DWP(16,$key));
   2165 &set_label("key_128_cold");
   2166 	&shufps		("xmm4","xmm0",0b00010000);
   2167 	&xorps		("xmm0","xmm4");
   2168 	&shufps		("xmm4","xmm0",0b10001100);
   2169 	&xorps		("xmm0","xmm4");
   2170 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2171 	&xorps		("xmm0","xmm1");
   2172 	&ret();
   2173 
   2174 &set_label("10rounds_alt",16);
   2175 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2176 	&mov		($rounds,8);
   2177 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2178 	&movdqa		("xmm2","xmm0");
   2179 	&movdqu		(&QWP(-16,$key),"xmm0");
   2180 
   2181 &set_label("loop_key128");
   2182 	&pshufb		("xmm0","xmm5");
   2183 	&aesenclast	("xmm0","xmm4");
   2184 	&pslld		("xmm4",1);
   2185 	&lea		($key,&DWP(16,$key));
   2186 
   2187 	&movdqa		("xmm3","xmm2");
   2188 	&pslldq		("xmm2",4);
   2189 	&pxor		("xmm3","xmm2");
   2190 	&pslldq		("xmm2",4);
   2191 	&pxor		("xmm3","xmm2");
   2192 	&pslldq		("xmm2",4);
   2193 	&pxor		("xmm2","xmm3");
   2194 
   2195 	&pxor		("xmm0","xmm2");
   2196 	&movdqu		(&QWP(-16,$key),"xmm0");
   2197 	&movdqa		("xmm2","xmm0");
   2198 
   2199 	&dec		($rounds);
   2200 	&jnz		(&label("loop_key128"));
   2201 
   2202 	&movdqa		("xmm4",&QWP(0x30,"ebx"));
   2203 
   2204 	&pshufb		("xmm0","xmm5");
   2205 	&aesenclast	("xmm0","xmm4");
   2206 	&pslld		("xmm4",1);
   2207 
   2208 	&movdqa		("xmm3","xmm2");
   2209 	&pslldq		("xmm2",4);
   2210 	&pxor		("xmm3","xmm2");
   2211 	&pslldq		("xmm2",4);
   2212 	&pxor		("xmm3","xmm2");
   2213 	&pslldq		("xmm2",4);
   2214 	&pxor		("xmm2","xmm3");
   2215 
   2216 	&pxor		("xmm0","xmm2");
   2217 	&movdqu		(&QWP(0,$key),"xmm0");
   2218 
   2219 	&movdqa		("xmm2","xmm0");
   2220 	&pshufb		("xmm0","xmm5");
   2221 	&aesenclast	("xmm0","xmm4");
   2222 
   2223 	&movdqa		("xmm3","xmm2");
   2224 	&pslldq		("xmm2",4);
   2225 	&pxor		("xmm3","xmm2");
   2226 	&pslldq		("xmm2",4);
   2227 	&pxor		("xmm3","xmm2");
   2228 	&pslldq		("xmm2",4);
   2229 	&pxor		("xmm2","xmm3");
   2230 
   2231 	&pxor		("xmm0","xmm2");
   2232 	&movdqu		(&QWP(16,$key),"xmm0");
   2233 
   2234 	&mov		($rounds,9);
   2235 	&mov		(&DWP(96,$key),$rounds);
   2236 
   2237 	&jmp	(&label("good_key"));
   2238 
   2239 &set_label("12rounds",16);
   2240 	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
   2241 	&cmp		("ebp",1<<28);
   2242 	&je		(&label("12rounds_alt"));
   2243 
   2244 	&mov		($rounds,11);
   2245 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2246 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
   2247 	&call		(&label("key_192a_cold"));
   2248 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
   2249 	&call		(&label("key_192b"));
   2250 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
   2251 	&call		(&label("key_192a"));
   2252 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
   2253 	&call		(&label("key_192b"));
   2254 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
   2255 	&call		(&label("key_192a"));
   2256 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
   2257 	&call		(&label("key_192b"));
   2258 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
   2259 	&call		(&label("key_192a"));
   2260 	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
   2261 	&call		(&label("key_192b"));
   2262 	&$movekey	(&QWP(0,$key),"xmm0");
   2263 	&mov		(&DWP(48,$key),$rounds);
   2264 
   2265 	&jmp	(&label("good_key"));
   2266 
   2267 &set_label("key_192a",16);
   2268 	&$movekey	(&QWP(0,$key),"xmm0");
   2269 	&lea		($key,&DWP(16,$key));
   2270 &set_label("key_192a_cold",16);
   2271 	&movaps		("xmm5","xmm2");
   2272 &set_label("key_192b_warm");
   2273 	&shufps		("xmm4","xmm0",0b00010000);
   2274 	&movdqa		("xmm3","xmm2");
   2275 	&xorps		("xmm0","xmm4");
   2276 	&shufps		("xmm4","xmm0",0b10001100);
   2277 	&pslldq		("xmm3",4);
   2278 	&xorps		("xmm0","xmm4");
   2279 	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
   2280 	&pxor		("xmm2","xmm3");
   2281 	&pxor		("xmm0","xmm1");
   2282 	&pshufd		("xmm3","xmm0",0b11111111);
   2283 	&pxor		("xmm2","xmm3");
   2284 	&ret();
   2285 
   2286 &set_label("key_192b",16);
   2287 	&movaps		("xmm3","xmm0");
   2288 	&shufps		("xmm5","xmm0",0b01000100);
   2289 	&$movekey	(&QWP(0,$key),"xmm5");
   2290 	&shufps		("xmm3","xmm2",0b01001110);
   2291 	&$movekey	(&QWP(16,$key),"xmm3");
   2292 	&lea		($key,&DWP(32,$key));
   2293 	&jmp		(&label("key_192b_warm"));
   2294 
   2295 &set_label("12rounds_alt",16);
   2296 	&movdqa		("xmm5",&QWP(0x10,"ebx"));
   2297 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2298 	&mov		($rounds,8);
   2299 	&movdqu		(&QWP(-16,$key),"xmm0");
   2300 
   2301 &set_label("loop_key192");
   2302 	&movq		(&QWP(0,$key),"xmm2");
   2303 	&movdqa		("xmm1","xmm2");
   2304 	&pshufb		("xmm2","xmm5");
   2305 	&aesenclast	("xmm2","xmm4");
   2306 	&pslld		("xmm4",1);
   2307 	&lea		($key,&DWP(24,$key));
   2308 
   2309 	&movdqa		("xmm3","xmm0");
   2310 	&pslldq		("xmm0",4);
   2311 	&pxor		("xmm3","xmm0");
   2312 	&pslldq		("xmm0",4);
   2313 	&pxor		("xmm3","xmm0");
   2314 	&pslldq		("xmm0",4);
   2315 	&pxor		("xmm0","xmm3");
   2316 
   2317 	&pshufd		("xmm3","xmm0",0xff);
   2318 	&pxor		("xmm3","xmm1");
   2319 	&pslldq		("xmm1",4);
   2320 	&pxor		("xmm3","xmm1");
   2321 
   2322 	&pxor		("xmm0","xmm2");
   2323 	&pxor		("xmm2","xmm3");
   2324 	&movdqu		(&QWP(-16,$key),"xmm0");
   2325 
   2326 	&dec		($rounds);
   2327 	&jnz		(&label("loop_key192"));
   2328 
   2329 	&mov	($rounds,11);
   2330 	&mov	(&DWP(32,$key),$rounds);
   2331 
   2332 	&jmp	(&label("good_key"));
   2333 
   2334 &set_label("14rounds",16);
   2335 	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
   2336 	&lea		($key,&DWP(16,$key));
   2337 	&cmp		("ebp",1<<28);
   2338 	&je		(&label("14rounds_alt"));
   2339 
   2340 	&mov		($rounds,13);
   2341 	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
   2342 	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
   2343 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
   2344 	&call		(&label("key_256a_cold"));
   2345 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
   2346 	&call		(&label("key_256b"));
   2347 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
   2348 	&call		(&label("key_256a"));
   2349 	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
   2350 	&call		(&label("key_256b"));
   2351 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
   2352 	&call		(&label("key_256a"));
   2353 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
   2354 	&call		(&label("key_256b"));
   2355 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
   2356 	&call		(&label("key_256a"));
   2357 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
   2358 	&call		(&label("key_256b"));
   2359 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
   2360 	&call		(&label("key_256a"));
   2361 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
   2362 	&call		(&label("key_256b"));
   2363 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
   2364 	&call		(&label("key_256a"));
   2365 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
   2366 	&call		(&label("key_256b"));
   2367 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
   2368 	&call		(&label("key_256a"));
   2369 	&$movekey	(&QWP(0,$key),"xmm0");
   2370 	&mov		(&DWP(16,$key),$rounds);
   2371 	&xor		("eax","eax");
   2372 
   2373 	&jmp	(&label("good_key"));
   2374 
   2375 &set_label("key_256a",16);
   2376 	&$movekey	(&QWP(0,$key),"xmm2");
   2377 	&lea		($key,&DWP(16,$key));
   2378 &set_label("key_256a_cold");
   2379 	&shufps		("xmm4","xmm0",0b00010000);
   2380 	&xorps		("xmm0","xmm4");
   2381 	&shufps		("xmm4","xmm0",0b10001100);
   2382 	&xorps		("xmm0","xmm4");
   2383 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2384 	&xorps		("xmm0","xmm1");
   2385 	&ret();
   2386 
   2387 &set_label("key_256b",16);
   2388 	&$movekey	(&QWP(0,$key),"xmm0");
   2389 	&lea		($key,&DWP(16,$key));
   2390 
   2391 	&shufps		("xmm4","xmm2",0b00010000);
   2392 	&xorps		("xmm2","xmm4");
   2393 	&shufps		("xmm4","xmm2",0b10001100);
   2394 	&xorps		("xmm2","xmm4");
   2395 	&shufps		("xmm1","xmm1",0b10101010);	# critical path
   2396 	&xorps		("xmm2","xmm1");
   2397 	&ret();
   2398 
   2399 &set_label("14rounds_alt",16);
   2400 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2401 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2402 	&mov		($rounds,7);
   2403 	&movdqu		(&QWP(-32,$key),"xmm0");
   2404 	&movdqa		("xmm1","xmm2");
   2405 	&movdqu		(&QWP(-16,$key),"xmm2");
   2406 
   2407 &set_label("loop_key256");
   2408 	&pshufb		("xmm2","xmm5");
   2409 	&aesenclast	("xmm2","xmm4");
   2410 
   2411 	&movdqa		("xmm3","xmm0");
   2412 	&pslldq		("xmm0",4);
   2413 	&pxor		("xmm3","xmm0");
   2414 	&pslldq		("xmm0",4);
   2415 	&pxor		("xmm3","xmm0");
   2416 	&pslldq		("xmm0",4);
   2417 	&pxor		("xmm0","xmm3");
   2418 	&pslld		("xmm4",1);
   2419 
   2420 	&pxor		("xmm0","xmm2");
   2421 	&movdqu		(&QWP(0,$key),"xmm0");
   2422 
   2423 	&dec		($rounds);
   2424 	&jz		(&label("done_key256"));
   2425 
   2426 	&pshufd		("xmm2","xmm0",0xff);
   2427 	&pxor		("xmm3","xmm3");
   2428 	&aesenclast	("xmm2","xmm3");
   2429 
   2430 	&movdqa		("xmm3","xmm1")
   2431 	&pslldq		("xmm1",4);
   2432 	&pxor		("xmm3","xmm1");
   2433 	&pslldq		("xmm1",4);
   2434 	&pxor		("xmm3","xmm1");
   2435 	&pslldq		("xmm1",4);
   2436 	&pxor		("xmm1","xmm3");
   2437 
   2438 	&pxor		("xmm2","xmm1");
   2439 	&movdqu		(&QWP(16,$key),"xmm2");
   2440 	&lea		($key,&DWP(32,$key));
   2441 	&movdqa		("xmm1","xmm2");
   2442 	&jmp		(&label("loop_key256"));
   2443 
   2444 &set_label("done_key256");
   2445 	&mov		($rounds,13);
   2446 	&mov		(&DWP(16,$key),$rounds);
   2447 
   2448 &set_label("good_key");
   2449 	&pxor	("xmm0","xmm0");
   2450 	&pxor	("xmm1","xmm1");
   2451 	&pxor	("xmm2","xmm2");
   2452 	&pxor	("xmm3","xmm3");
   2453 	&pxor	("xmm4","xmm4");
   2454 	&pxor	("xmm5","xmm5");
   2455 	&xor	("eax","eax");
   2456 	&pop	("ebx");
   2457 	&pop	("ebp");
   2458 	&ret	();
   2459 
   2460 &set_label("bad_pointer",4);
   2461 	&mov	("eax",-1);
   2462 	&pop	("ebx");
   2463 	&pop	("ebp");
   2464 	&ret	();
   2465 &set_label("bad_keybits",4);
   2466 	&pxor	("xmm0","xmm0");
   2467 	&mov	("eax",-2);
   2468 	&pop	("ebx");
   2469 	&pop	("ebp");
   2470 	&ret	();
   2471 &function_end_B("_aesni_set_encrypt_key");
   2472 
   2473 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
   2474 #                              AES_KEY *key)
   2475 &function_begin_B("${PREFIX}_set_encrypt_key");
   2476 	&mov	("eax",&wparam(0));
   2477 	&mov	($rounds,&wparam(1));
   2478 	&mov	($key,&wparam(2));
   2479 	&call	("_aesni_set_encrypt_key");
   2480 	&ret	();
   2481 &function_end_B("${PREFIX}_set_encrypt_key");
   2482 
   2483 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
   2484 #                              AES_KEY *key)
   2485 &function_begin_B("${PREFIX}_set_decrypt_key");
   2486 	&mov	("eax",&wparam(0));
   2487 	&mov	($rounds,&wparam(1));
   2488 	&mov	($key,&wparam(2));
   2489 	&call	("_aesni_set_encrypt_key");
   2490 	&mov	($key,&wparam(2));
   2491 	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
   2492 	&test	("eax","eax");
   2493 	&jnz	(&label("dec_key_ret"));
   2494 	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
   2495 
   2496 	&$movekey	("xmm0",&QWP(0,$key));	# just swap
   2497 	&$movekey	("xmm1",&QWP(0,"eax"));
   2498 	&$movekey	(&QWP(0,"eax"),"xmm0");
   2499 	&$movekey	(&QWP(0,$key),"xmm1");
   2500 	&lea		($key,&DWP(16,$key));
   2501 	&lea		("eax",&DWP(-16,"eax"));
   2502 
   2503 &set_label("dec_key_inverse");
   2504 	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
   2505 	&$movekey	("xmm1",&QWP(0,"eax"));
   2506 	&aesimc		("xmm0","xmm0");
   2507 	&aesimc		("xmm1","xmm1");
   2508 	&lea		($key,&DWP(16,$key));
   2509 	&lea		("eax",&DWP(-16,"eax"));
   2510 	&$movekey	(&QWP(16,"eax"),"xmm0");
   2511 	&$movekey	(&QWP(-16,$key),"xmm1");
   2512 	&cmp		("eax",$key);
   2513 	&ja		(&label("dec_key_inverse"));
   2514 
   2515 	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
   2516 	&aesimc		("xmm0","xmm0");
   2517 	&$movekey	(&QWP(0,$key),"xmm0");
   2518 
   2519 	&pxor		("xmm0","xmm0");
   2520 	&pxor		("xmm1","xmm1");
   2521 	&xor		("eax","eax");		# return success
   2522 &set_label("dec_key_ret");
   2523 	&ret	();
   2524 &function_end_B("${PREFIX}_set_decrypt_key");
   2525 
   2526 &set_label("key_const",64);
   2527 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
   2528 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);
   2529 &data_word(1,1,1,1);
   2530 &data_word(0x1b,0x1b,0x1b,0x1b);
   2531 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
   2532 
   2533 &asm_finish();
   2534