Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # This module implements support for Intel AES-NI extension. In
     11 # OpenSSL context it's used with Intel engine, but can also be used as
     12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
     13 # details].
     14 #
     15 # Performance.
     16 #
     17 # To start with see corresponding paragraph in aesni-x86_64.pl...
     18 # Instead of filling table similar to one found there I've chosen to
     19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
     20 # The simplified table below represents 32-bit performance relative
     21 # to 64-bit one in every given point. Ratios vary for different
     22 # encryption modes, therefore interval values.
     23 #
     24 #	16-byte     64-byte     256-byte    1-KB        8-KB
     25 #	53-67%      67-84%      91-94%      95-98%      97-99.5%
     26 #
     27 # Lower ratios for smaller block sizes are perfectly understandable,
     28 # because function call overhead is higher in 32-bit mode. Largest
     29 # 8-KB block performance is virtually same: 32-bit code is less than
     30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
     31 
     32 # January 2011
     33 #
     34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
     35 # interleaves at most 6 aes[enc|dec] instructions, because there are
     36 # not enough registers for 8x interleave [which should be optimal for
     37 # Sandy Bridge]. Actually, performance results for 6x interleave
     38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
     39 # module.
     40 
     41 # April 2011
     42 #
     43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
     44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
     45 
     46 ######################################################################
     47 # Current large-block performance in cycles per byte processed with
     48 # 128-bit key (less is better).
     49 #
     50 #		CBC en-/decrypt	CTR	XTS	ECB
     51 # Westmere	3.77/1.37	1.37	1.52	1.27
     52 # * Bridge	5.07/0.98	0.99	1.09	0.91
     53 # Haswell	4.44/0.80	0.97	1.03	0.72
     54 # Skylake	2.68/0.65	0.65	0.66	0.64
     55 # Silvermont	5.77/3.56	3.67	4.03	3.46
     56 # Goldmont	3.84/1.39	1.39	1.63	1.31
     57 # Bulldozer	5.80/0.98	1.05	1.24	0.93
     58 
     59 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
     60 			# generates drop-in replacement for
     61 			# crypto/aes/asm/aes-586.pl:-)
     62 $inline=1;		# inline _aesni_[en|de]crypt
     63 
        # Take everything up to the last path separator of $0 as the script's
        # directory, and add both that directory and ../../perlasm relative
        # to it to @INC so that x86asm.pl can be located.
     64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     65 push(@INC,"${dir}","${dir}../../perlasm");
     66 require "x86asm.pl";
     67 
        # The last command-line argument names the output file.  STDOUT is
        # aliased to it, so everything the generator prints ends up there.
        # Abort if the file cannot be opened: continuing would silently
        # discard all generated assembly.
     68 $output = pop;
     69 open OUT,">$output" or die "can't open $output: $!";
     70 *STDOUT=*OUT;
     71 
        # Pass the first remaining argument and the script name to asm_init
        # (presumably provided by x86asm.pl -- confirm).
     72 &asm_init($ARGV[0],$0);
     73 
     74 &external_label("OPENSSL_ia32cap_P");
     75 &static_label("key_const");
     76 
        # $movekey is the instruction used throughout to load round keys.
        # Both branches currently select movups, so the value of $PREFIX makes
        # no difference here.
     77 if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
     78 else			{ $movekey=\&movups; }
     79 
        # Symbolic names for the general-purpose registers used by the
        # generated code.
     80 $len="eax";
     81 $rounds="ecx";
     82 $key="edx";
     83 $inp="esi";
     84 $out="edi";
     85 $rounds_="ebx";	# backup copy for $rounds
     86 $key_="ebp";	# backup copy for $key
     87 
        # XMM register allocation.  xmm5-xmm7 each carry two names, so
        # $in1/$in0/$ivec share registers with $inout3/$inout4/$inout5
        # respectively.
     88 $rndkey0="xmm0";
     89 $rndkey1="xmm1";
     90 $inout0="xmm2";
     91 $inout1="xmm3";
     92 $inout2="xmm4";
     93 $inout3="xmm5";	$in1="xmm5";
     94 $inout4="xmm6";	$in0="xmm6";
     95 $inout5="xmm7";	$ivec="xmm7";
     97 # AESNI extension
        #
        # The helpers below hand-assemble AES-NI instructions as raw opcode
        # bytes through &data_byte.  Only the register-to-register form on
        # xmm0-xmm7 is encoded; for any other operand pair nothing is emitted.
        # The last byte before the optional immediate is a register-direct
        # ModRM byte: 0xc0 | (destination<<3) | source.
     98 sub aeskeygenassist
     99 { my($to,$from,$imm8)=@_;
    100     if ("$to:$from" =~ /xmm([0-7]):xmm([0-7])/)
    101     {	my $modrm=0xc0|($1<<3)|$2; &data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm8);	}
    102 }
        # Shared encoder for the 66 0f 38 xx family; $opbyte selects the
        # instruction.
    103 sub aescommon
    104 { my($opbyte,$to,$from)=@_;
    105     if ("$to:$from" =~ /xmm([0-7]):xmm([0-7])/)
    106     {	my $modrm=0xc0|($1<<3)|$2; &data_byte(0x66,0x0f,0x38,$opbyte,$modrm);	}
    107 }
    108 sub aesimc	{ return aescommon(0xdb,@_); }
    109 sub aesenc	{ return aescommon(0xdc,@_); }
    110 sub aesenclast	{ return aescommon(0xdd,@_); }
    111 sub aesdec	{ return aescommon(0xde,@_); }
    112 sub aesdeclast	{ return aescommon(0xdf,@_); }
    113 
    115 # Inline version of internal aesni_[en|de]crypt1
        #
        # Emits, at the point of the call, a loop that runs one block through
        # the cipher.
        #   $p     - "enc" or "dec"; selects the aes${p}/aes${p}last helpers and
        #            is used as the label prefix.
        #   $inout - register holding the block; defaults to $inout0.
        #   $ivec  - optional register.  When given, round key 0 is xored into
        #            $ivec and $ivec is then xored into $inout, instead of
        #            xoring round key 0 into $inout directly.
        # $sn counts the expansions so that every emitted loop label is unique.
        # The emitted code advances $key and counts $rounds down to zero.
    116 { my $sn;
    117 sub aesni_inline_generate1
    118 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
    119   $sn++;
    120 
    121     &$movekey		($rndkey0,&QWP(0,$key));
    122     &$movekey		($rndkey1,&QWP(16,$key));
    123     &xorps		($ivec,$rndkey0)	if (defined($ivec));
    124     &lea		($key,&DWP(32,$key));
    125     &xorps		($inout,$ivec)		if (defined($ivec));
    126     &xorps		($inout,$rndkey0)	if (!defined($ivec));
    127     &set_label("${p}1_loop_$sn");
    128 	eval"&aes${p}	($inout,$rndkey1)";
    129 	&dec		($rounds);
    130 	&$movekey	($rndkey1,&QWP(0,$key));
    131 	&lea		($key,&DWP(16,$key));
        	# jnz tests the flags left by dec above; the movups and lea in
        	# between do not modify flags.
    132     &jnz		(&label("${p}1_loop_$sn"));
    133     eval"&aes${p}last	($inout,$rndkey1)";
    134 }}
    135 
    136 sub aesni_generate1	# fully unrolled loop
        # Emits the function _aesni_${p}rypt1, which processes one block held
        # in $inout (default $inout0) without a loop.
        #   $p     - "enc" or "dec"; selects the aes${p}/aes${p}last helpers and
        #            is used in the function and label names.
        #   $inout - register holding the block.
        # $rounds is compared with 11 once: below 11 jumps to ${p}128, equal
        # jumps to ${p}192, anything else falls through the longest sequence.
        # The lea instructions between cmp and je do not modify flags, so both
        # branches test the same comparison.
    137 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
    138 
    139     &function_begin_B("_aesni_${p}rypt1");
    140 	&movups		($rndkey0,&QWP(0,$key));
    141 	&$movekey	($rndkey1,&QWP(0x10,$key));
    142 	&xorps		($inout,$rndkey0);
    143 	&$movekey	($rndkey0,&QWP(0x20,$key));
    144 	&lea		($key,&DWP(0x30,$key));
    145 	&cmp		($rounds,11);
    146 	&jb		(&label("${p}128"));
    147 	&lea		($key,&DWP(0x20,$key));
    148 	&je		(&label("${p}192"));
    149 	&lea		($key,&DWP(0x20,$key));
    150 	eval"&aes${p}	($inout,$rndkey1)";
    151 	&$movekey	($rndkey1,&QWP(-0x40,$key));
    152 	eval"&aes${p}	($inout,$rndkey0)";
    153 	&$movekey	($rndkey0,&QWP(-0x30,$key));
    154     &set_label("${p}192");
    155 	eval"&aes${p}	($inout,$rndkey1)";
    156 	&$movekey	($rndkey1,&QWP(-0x20,$key));
    157 	eval"&aes${p}	($inout,$rndkey0)";
    158 	&$movekey	($rndkey0,&QWP(-0x10,$key));
    159     &set_label("${p}128");
    160 	eval"&aes${p}	($inout,$rndkey1)";
    161 	&$movekey	($rndkey1,&QWP(0,$key));
    162 	eval"&aes${p}	($inout,$rndkey0)";
    163 	&$movekey	($rndkey0,&QWP(0x10,$key));
    164 	eval"&aes${p}	($inout,$rndkey1)";
    165 	&$movekey	($rndkey1,&QWP(0x20,$key));
    166 	eval"&aes${p}	($inout,$rndkey0)";
    167 	&$movekey	($rndkey0,&QWP(0x30,$key));
    168 	eval"&aes${p}	($inout,$rndkey1)";
    169 	&$movekey	($rndkey1,&QWP(0x40,$key));
    170 	eval"&aes${p}	($inout,$rndkey0)";
    171 	&$movekey	($rndkey0,&QWP(0x50,$key));
    172 	eval"&aes${p}	($inout,$rndkey1)";
    173 	&$movekey	($rndkey1,&QWP(0x60,$key));
    174 	eval"&aes${p}	($inout,$rndkey0)";
    175 	&$movekey	($rndkey0,&QWP(0x70,$key));
    176 	eval"&aes${p}	($inout,$rndkey1)";
    177     eval"&aes${p}last	($inout,$rndkey0)";
    178     &ret();
    179     &function_end_B("_aesni_${p}rypt1");
    180 }
    181 
    183 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Single-block entry point.  The out-of-line _aesni_encrypt1 helper is
        # only generated when $inline is off; otherwise the loop is expanded
        # in place.  The emitted code loads 16 bytes from inp, reads the round
        # count from offset 240 of key, stores the result to out and zeroes
        # the three XMM registers it used.
    184 &aesni_generate1("enc") if (!$inline);
    185 &function_begin_B("${PREFIX}_encrypt");
    186 	&mov	("eax",&wparam(0));
    187 	&mov	($key,&wparam(2));
    188 	&movups	($inout0,&QWP(0,"eax"));
    189 	&mov	($rounds,&DWP(240,$key));
    190 	&mov	("eax",&wparam(1));
    191 	if ($inline)
    192 	{   &aesni_inline_generate1("enc");	}
    193 	else
    194 	{   &call	("_aesni_encrypt1");	}
    195 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    196 	&pxor	($rndkey1,$rndkey1);
    197 	&movups	(&QWP(0,"eax"),$inout0);
    198 	&pxor	($inout0,$inout0);
    199 	&ret	();
    200 &function_end_B("${PREFIX}_encrypt");
    201 
    202 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Single-block entry point, structured the same way as the encrypt
        # counterpart but using the "dec" helpers: load 16 bytes from inp,
        # read the round count from offset 240 of key, run the single-block
        # routine (inline or via _aesni_decrypt1), store to out and zero the
        # XMM registers that were used.
    203 &aesni_generate1("dec") if(!$inline);
    204 &function_begin_B("${PREFIX}_decrypt");
    205 	&mov	("eax",&wparam(0));
    206 	&mov	($key,&wparam(2));
    207 	&movups	($inout0,&QWP(0,"eax"));
    208 	&mov	($rounds,&DWP(240,$key));
    209 	&mov	("eax",&wparam(1));
    210 	if ($inline)
    211 	{   &aesni_inline_generate1("dec");	}
    212 	else
    213 	{   &call	("_aesni_decrypt1");	}
    214 	&pxor	($rndkey0,$rndkey0);		# clear register bank
    215 	&pxor	($rndkey1,$rndkey1);
    216 	&movups	(&QWP(0,"eax"),$inout0);
    217 	&pxor	($inout0,$inout0);
    218 	&ret	();
    219 &function_end_B("${PREFIX}_decrypt");
    220 
    221 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
    222 # factor. Why were 3x subroutines originally used in loops? Even though
    223 # aes[enc|dec] latency was originally 6, it could be scheduled only
    224 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
    225 # utilization, i.e. when subroutine's throughput is virtually same as
    226 # of non-interleaved subroutine [for number of input blocks up to 3].
    227 # This is why it originally made no sense to implement 2x subroutine.
    228 # But times change and it became appropriate to spend extra 192 bytes
    229 # on 2x subroutine on Atom Silvermont account. For processors that
    230 # can schedule aes[enc|dec] every cycle optimal interleave factor
    231 # equals to corresponding instructions latency. 8x is optimal for
    232 # * Bridge, but it's unfeasible to accommodate such implementation
    233 # in XMM registers addressable in 32-bit mode and therefore maximum
    234 # of 6x is used instead...
    235 
    236 sub aesni_generate2
        # Emits the function _aesni_${p}rypt2, which runs $inout0 and $inout1
        # through the cipher in an interleaved fashion.  $p is "enc" or "dec".
        #
        # In the emitted code $rounds is multiplied by 16, $key is moved
        # forward by that amount plus 32, and $rounds is then negated and
        # biased by 16 so that it serves both as the round-key index and as
        # the loop counter.  The loop ends when the add inside it brings
        # $rounds to zero; the AES and movups instructions after the add do
        # not modify flags.  Both $key and $rounds are clobbered.
    237 { my $p=shift;
    238 
    239     &function_begin_B("_aesni_${p}rypt2");
    240 	&$movekey	($rndkey0,&QWP(0,$key));
    241 	&shl		($rounds,4);
    242 	&$movekey	($rndkey1,&QWP(16,$key));
    243 	&xorps		($inout0,$rndkey0);
    244 	&pxor		($inout1,$rndkey0);
    245 	&$movekey	($rndkey0,&QWP(32,$key));
    246 	&lea		($key,&DWP(32,$key,$rounds));
    247 	&neg		($rounds);
    248 	&add		($rounds,16);
    249 
    250     &set_label("${p}2_loop");
    251 	eval"&aes${p}	($inout0,$rndkey1)";
    252 	eval"&aes${p}	($inout1,$rndkey1)";
    253 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    254 	&add		($rounds,32);
    255 	eval"&aes${p}	($inout0,$rndkey0)";
    256 	eval"&aes${p}	($inout1,$rndkey0)";
    257 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    258 	&jnz		(&label("${p}2_loop"));
    259     eval"&aes${p}	($inout0,$rndkey1)";
    260     eval"&aes${p}	($inout1,$rndkey1)";
    261     eval"&aes${p}last	($inout0,$rndkey0)";
    262     eval"&aes${p}last	($inout1,$rndkey0)";
    263     &ret();
    264     &function_end_B("_aesni_${p}rypt2");
    265 }
    266 
    267 sub aesni_generate3
        # Emits the function _aesni_${p}rypt3, which runs $inout0..$inout2
        # through the cipher in an interleaved fashion.  $p is "enc" or "dec".
        #
        # The emitted code scales $rounds by 16, advances $key past the
        # schedule by that amount plus 32 and then uses the negated, biased
        # $rounds as a combined round-key index and loop counter: the loop
        # exits when the add inside it reaches zero.  Both $key and $rounds
        # are clobbered.
    268 { my $p=shift;
    269 
    270     &function_begin_B("_aesni_${p}rypt3");
    271 	&$movekey	($rndkey0,&QWP(0,$key));
    272 	&shl		($rounds,4);
    273 	&$movekey	($rndkey1,&QWP(16,$key));
    274 	&xorps		($inout0,$rndkey0);
    275 	&pxor		($inout1,$rndkey0);
    276 	&pxor		($inout2,$rndkey0);
    277 	&$movekey	($rndkey0,&QWP(32,$key));
    278 	&lea		($key,&DWP(32,$key,$rounds));
    279 	&neg		($rounds);
    280 	&add		($rounds,16);
    281 
    282     &set_label("${p}3_loop");
    283 	eval"&aes${p}	($inout0,$rndkey1)";
    284 	eval"&aes${p}	($inout1,$rndkey1)";
    285 	eval"&aes${p}	($inout2,$rndkey1)";
    286 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    287 	&add		($rounds,32);
    288 	eval"&aes${p}	($inout0,$rndkey0)";
    289 	eval"&aes${p}	($inout1,$rndkey0)";
    290 	eval"&aes${p}	($inout2,$rndkey0)";
    291 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    292 	&jnz		(&label("${p}3_loop"));
    293     eval"&aes${p}	($inout0,$rndkey1)";
    294     eval"&aes${p}	($inout1,$rndkey1)";
    295     eval"&aes${p}	($inout2,$rndkey1)";
    296     eval"&aes${p}last	($inout0,$rndkey0)";
    297     eval"&aes${p}last	($inout1,$rndkey0)";
    298     eval"&aes${p}last	($inout2,$rndkey0)";
    299     &ret();
    300     &function_end_B("_aesni_${p}rypt3");
    301 }
    302 
    303 # 4x interleave is implemented to improve small block performance,
    304 # most notably [and naturally] 4 block by ~30%. One can argue that one
    305 # should have implemented 5x as well, but improvement would be <20%,
    306 # so it's not worth it...
        #
        # Emits the function _aesni_${p}rypt4, which runs $inout0..$inout3
        # through the cipher in an interleaved fashion.  $p is "enc" or "dec".
        # $rounds is turned into a negative byte offset that doubles as the
        # loop counter, exactly as in the 2x and 3x routines; $key and $rounds
        # are clobbered.
    307 sub aesni_generate4
    308 { my $p=shift;
    309 
    310     &function_begin_B("_aesni_${p}rypt4");
    311 	&$movekey	($rndkey0,&QWP(0,$key));
    312 	&$movekey	($rndkey1,&QWP(16,$key));
    313 	&shl		($rounds,4);
    314 	&xorps		($inout0,$rndkey0);
    315 	&pxor		($inout1,$rndkey0);
    316 	&pxor		($inout2,$rndkey0);
    317 	&pxor		($inout3,$rndkey0);
    318 	&$movekey	($rndkey0,&QWP(32,$key));
    319 	&lea		($key,&DWP(32,$key,$rounds));
    320 	&neg		($rounds);
        	# 0f 1f 40 00 is a 4-byte NOP; presumably padding so that the
        	# loop label lands on a favourable boundary -- confirm.
    321 	&data_byte	(0x0f,0x1f,0x40,0x00);
    322 	&add		($rounds,16);
    323 
    324     &set_label("${p}4_loop");
    325 	eval"&aes${p}	($inout0,$rndkey1)";
    326 	eval"&aes${p}	($inout1,$rndkey1)";
    327 	eval"&aes${p}	($inout2,$rndkey1)";
    328 	eval"&aes${p}	($inout3,$rndkey1)";
    329 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    330 	&add		($rounds,32);
    331 	eval"&aes${p}	($inout0,$rndkey0)";
    332 	eval"&aes${p}	($inout1,$rndkey0)";
    333 	eval"&aes${p}	($inout2,$rndkey0)";
    334 	eval"&aes${p}	($inout3,$rndkey0)";
    335 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    336     &jnz		(&label("${p}4_loop"));
    337 
    338     eval"&aes${p}	($inout0,$rndkey1)";
    339     eval"&aes${p}	($inout1,$rndkey1)";
    340     eval"&aes${p}	($inout2,$rndkey1)";
    341     eval"&aes${p}	($inout3,$rndkey1)";
    342     eval"&aes${p}last	($inout0,$rndkey0)";
    343     eval"&aes${p}last	($inout1,$rndkey0)";
    344     eval"&aes${p}last	($inout2,$rndkey0)";
    345     eval"&aes${p}last	($inout3,$rndkey0)";
    346     &ret();
    347     &function_end_B("_aesni_${p}rypt4");
    348 }
    349 
    350 sub aesni_generate6
        # Emits the function _aesni_${p}rypt6, which runs $inout0..$inout5
        # through the cipher in an interleaved fashion.  $p is "enc" or "dec".
        #
        # The prologue already applies the first round to $inout0..$inout2
        # while the remaining blocks are still being xored with round key 0,
        # then jumps to _aesni_${p}rypt6_inner so that only $inout3..$inout5
        # receive that round inside the loop body on the first pass.
        # _aesni_${p}rypt6_enter is declared as a static label and marks a
        # second entry point just past the first-round instructions; it is
        # not referenced in this sub (presumably used by callers that perform
        # the first round themselves -- confirm).
        # As in the smaller variants, $rounds becomes a negative byte offset
        # that doubles as loop counter; $key and $rounds are clobbered.
    351 { my $p=shift;
    352 
    353     &function_begin_B("_aesni_${p}rypt6");
    354     &static_label("_aesni_${p}rypt6_enter");
    355 	&$movekey	($rndkey0,&QWP(0,$key));
    356 	&shl		($rounds,4);
    357 	&$movekey	($rndkey1,&QWP(16,$key));
    358 	&xorps		($inout0,$rndkey0);
    359 	&pxor		($inout1,$rndkey0);	# pxor does better here
    360 	&pxor		($inout2,$rndkey0);
    361 	eval"&aes${p}	($inout0,$rndkey1)";
    362 	&pxor		($inout3,$rndkey0);
    363 	&pxor		($inout4,$rndkey0);
    364 	eval"&aes${p}	($inout1,$rndkey1)";
    365 	&lea		($key,&DWP(32,$key,$rounds));
    366 	&neg		($rounds);
    367 	eval"&aes${p}	($inout2,$rndkey1)";
    368 	&pxor		($inout5,$rndkey0);
    369 	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
    370 	&add		($rounds,16);
    371 	&jmp		(&label("_aesni_${p}rypt6_inner"));
    372 
    373     &set_label("${p}6_loop",16);
    374 	eval"&aes${p}	($inout0,$rndkey1)";
    375 	eval"&aes${p}	($inout1,$rndkey1)";
    376 	eval"&aes${p}	($inout2,$rndkey1)";
    377     &set_label("_aesni_${p}rypt6_inner");
    378 	eval"&aes${p}	($inout3,$rndkey1)";
    379 	eval"&aes${p}	($inout4,$rndkey1)";
    380 	eval"&aes${p}	($inout5,$rndkey1)";
    381     &set_label("_aesni_${p}rypt6_enter");
    382 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    383 	&add		($rounds,32);
    384 	eval"&aes${p}	($inout0,$rndkey0)";
    385 	eval"&aes${p}	($inout1,$rndkey0)";
    386 	eval"&aes${p}	($inout2,$rndkey0)";
    387 	eval"&aes${p}	($inout3,$rndkey0)";
    388 	eval"&aes${p}	($inout4,$rndkey0)";
    389 	eval"&aes${p}	($inout5,$rndkey0)";
    390 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    391     &jnz		(&label("${p}6_loop"));
    392 
    393     eval"&aes${p}	($inout0,$rndkey1)";
    394     eval"&aes${p}	($inout1,$rndkey1)";
    395     eval"&aes${p}	($inout2,$rndkey1)";
    396     eval"&aes${p}	($inout3,$rndkey1)";
    397     eval"&aes${p}	($inout4,$rndkey1)";
    398     eval"&aes${p}	($inout5,$rndkey1)";
    399     eval"&aes${p}last	($inout0,$rndkey0)";
    400     eval"&aes${p}last	($inout1,$rndkey0)";
    401     eval"&aes${p}last	($inout2,$rndkey0)";
    402     eval"&aes${p}last	($inout3,$rndkey0)";
    403     eval"&aes${p}last	($inout4,$rndkey0)";
    404     eval"&aes${p}last	($inout5,$rndkey0)";
    405     &ret();
    406     &function_end_B("_aesni_${p}rypt6");
    407 }
    408 &aesni_generate2("enc") if ($PREFIX eq "aesni");	# encrypt variants are emitted only for the "aesni" prefix
    409 &aesni_generate2("dec");				# decrypt variants are always emitted
    410 &aesni_generate3("enc") if ($PREFIX eq "aesni");
    411 &aesni_generate3("dec");
    412 &aesni_generate4("enc") if ($PREFIX eq "aesni");
    413 &aesni_generate4("dec");
    414 &aesni_generate6("enc") if ($PREFIX eq "aesni");
    415 &aesni_generate6("dec");
    416 
    418 if ($PREFIX eq "aesni") {
    419 ######################################################################
    420 # void aesni_ecb_encrypt (const void *in, void *out,
    421 #                         size_t length, const AES_KEY *key,
    422 #                         int enc);
        #
        # length is rounded down to a multiple of 16 and the function returns
        # immediately if nothing is left.  A zero enc selects the decrypt
        # half, anything else the encrypt half.  Each half processes six
        # blocks per iteration through _aesni_[en|de]crypt6 for as long as at
        # least 0x60 bytes remain, then dispatches the 1-5 leftover blocks to
        # the matching 1x/2x/3x/4x routine (five blocks reuse the 6x routine
        # with a zeroed sixth register).  $key_ and $rounds_ hold copies of
        # $key and $rounds, which are restored after every 6x call in the
        # bulk loop.  All eight XMM registers are zeroed before returning.
    423 &function_begin("aesni_ecb_encrypt");
    424 	&mov	($inp,&wparam(0));
    425 	&mov	($out,&wparam(1));
    426 	&mov	($len,&wparam(2));
    427 	&mov	($key,&wparam(3));
    428 	&mov	($rounds_,&wparam(4));
    429 	&and	($len,-16);
    430 	&jz	(&label("ecb_ret"));
    431 	&mov	($rounds,&DWP(240,$key));
    432 	&test	($rounds_,$rounds_);
    433 	&jz	(&label("ecb_decrypt"));
    434 
    435 	&mov	($key_,$key);		# backup $key
    436 	&mov	($rounds_,$rounds);	# backup $rounds
    437 	&cmp	($len,0x60);
    438 	&jb	(&label("ecb_enc_tail"));
    439 
    440 	&movdqu	($inout0,&QWP(0,$inp));
    441 	&movdqu	($inout1,&QWP(0x10,$inp));
    442 	&movdqu	($inout2,&QWP(0x20,$inp));
    443 	&movdqu	($inout3,&QWP(0x30,$inp));
    444 	&movdqu	($inout4,&QWP(0x40,$inp));
    445 	&movdqu	($inout5,&QWP(0x50,$inp));
    446 	&lea	($inp,&DWP(0x60,$inp));
    447 	&sub	($len,0x60);
    448 	&jmp	(&label("ecb_enc_loop6_enter"));
    449 
        # Loop body: store the six results of the previous iteration while
        # loading the next six input blocks.
    450 &set_label("ecb_enc_loop6",16);
    451 	&movups	(&QWP(0,$out),$inout0);
    452 	&movdqu	($inout0,&QWP(0,$inp));
    453 	&movups	(&QWP(0x10,$out),$inout1);
    454 	&movdqu	($inout1,&QWP(0x10,$inp));
    455 	&movups	(&QWP(0x20,$out),$inout2);
    456 	&movdqu	($inout2,&QWP(0x20,$inp));
    457 	&movups	(&QWP(0x30,$out),$inout3);
    458 	&movdqu	($inout3,&QWP(0x30,$inp));
    459 	&movups	(&QWP(0x40,$out),$inout4);
    460 	&movdqu	($inout4,&QWP(0x40,$inp));
    461 	&movups	(&QWP(0x50,$out),$inout5);
    462 	&lea	($out,&DWP(0x60,$out));
    463 	&movdqu	($inout5,&QWP(0x50,$inp));
    464 	&lea	($inp,&DWP(0x60,$inp));
    465 &set_label("ecb_enc_loop6_enter");
    466 
    467 	&call	("_aesni_encrypt6");
    468 
    469 	&mov	($key,$key_);		# restore $key
    470 	&mov	($rounds,$rounds_);	# restore $rounds
    471 	&sub	($len,0x60);
    472 	&jnc	(&label("ecb_enc_loop6"));
    473 
    474 	&movups	(&QWP(0,$out),$inout0);
    475 	&movups	(&QWP(0x10,$out),$inout1);
    476 	&movups	(&QWP(0x20,$out),$inout2);
    477 	&movups	(&QWP(0x30,$out),$inout3);
    478 	&movups	(&QWP(0x40,$out),$inout4);
    479 	&movups	(&QWP(0x50,$out),$inout5);
    480 	&lea	($out,&DWP(0x60,$out));
    481 	&add	($len,0x60);
    482 	&jz	(&label("ecb_ret"));
    483 
        # Tail dispatch on the number of remaining blocks.  Each cmp feeds two
        # branches (jb, then je); the movups in between does not modify flags.
    484 &set_label("ecb_enc_tail");
    485 	&movups	($inout0,&QWP(0,$inp));
    486 	&cmp	($len,0x20);
    487 	&jb	(&label("ecb_enc_one"));
    488 	&movups	($inout1,&QWP(0x10,$inp));
    489 	&je	(&label("ecb_enc_two"));
    490 	&movups	($inout2,&QWP(0x20,$inp));
    491 	&cmp	($len,0x40);
    492 	&jb	(&label("ecb_enc_three"));
    493 	&movups	($inout3,&QWP(0x30,$inp));
    494 	&je	(&label("ecb_enc_four"));
    495 	&movups	($inout4,&QWP(0x40,$inp));
    496 	&xorps	($inout5,$inout5);
    497 	&call	("_aesni_encrypt6");
    498 	&movups	(&QWP(0,$out),$inout0);
    499 	&movups	(&QWP(0x10,$out),$inout1);
    500 	&movups	(&QWP(0x20,$out),$inout2);
    501 	&movups	(&QWP(0x30,$out),$inout3);
    502 	&movups	(&QWP(0x40,$out),$inout4);
    503 	&jmp	(&label("ecb_ret"));
    504 
    505 &set_label("ecb_enc_one",16);
    506 	if ($inline)
    507 	{   &aesni_inline_generate1("enc");	}
    508 	else
    509 	{   &call	("_aesni_encrypt1");	}
    510 	&movups	(&QWP(0,$out),$inout0);
    511 	&jmp	(&label("ecb_ret"));
    512 
    513 &set_label("ecb_enc_two",16);
    514 	&call	("_aesni_encrypt2");
    515 	&movups	(&QWP(0,$out),$inout0);
    516 	&movups	(&QWP(0x10,$out),$inout1);
    517 	&jmp	(&label("ecb_ret"));
    518 
    519 &set_label("ecb_enc_three",16);
    520 	&call	("_aesni_encrypt3");
    521 	&movups	(&QWP(0,$out),$inout0);
    522 	&movups	(&QWP(0x10,$out),$inout1);
    523 	&movups	(&QWP(0x20,$out),$inout2);
    524 	&jmp	(&label("ecb_ret"));
    525 
    526 &set_label("ecb_enc_four",16);
    527 	&call	("_aesni_encrypt4");
    528 	&movups	(&QWP(0,$out),$inout0);
    529 	&movups	(&QWP(0x10,$out),$inout1);
    530 	&movups	(&QWP(0x20,$out),$inout2);
    531 	&movups	(&QWP(0x30,$out),$inout3);
    532 	&jmp	(&label("ecb_ret"));
    533 ######################################################################
        # Decrypt half: same structure as the encrypt half above, using the
        # _aesni_decryptN routines.
    534 &set_label("ecb_decrypt",16);
    535 	&mov	($key_,$key);		# backup $key
    536 	&mov	($rounds_,$rounds);	# backup $rounds
    537 	&cmp	($len,0x60);
    538 	&jb	(&label("ecb_dec_tail"));
    539 
    540 	&movdqu	($inout0,&QWP(0,$inp));
    541 	&movdqu	($inout1,&QWP(0x10,$inp));
    542 	&movdqu	($inout2,&QWP(0x20,$inp));
    543 	&movdqu	($inout3,&QWP(0x30,$inp));
    544 	&movdqu	($inout4,&QWP(0x40,$inp));
    545 	&movdqu	($inout5,&QWP(0x50,$inp));
    546 	&lea	($inp,&DWP(0x60,$inp));
    547 	&sub	($len,0x60);
    548 	&jmp	(&label("ecb_dec_loop6_enter"));
    549 
    550 &set_label("ecb_dec_loop6",16);
    551 	&movups	(&QWP(0,$out),$inout0);
    552 	&movdqu	($inout0,&QWP(0,$inp));
    553 	&movups	(&QWP(0x10,$out),$inout1);
    554 	&movdqu	($inout1,&QWP(0x10,$inp));
    555 	&movups	(&QWP(0x20,$out),$inout2);
    556 	&movdqu	($inout2,&QWP(0x20,$inp));
    557 	&movups	(&QWP(0x30,$out),$inout3);
    558 	&movdqu	($inout3,&QWP(0x30,$inp));
    559 	&movups	(&QWP(0x40,$out),$inout4);
    560 	&movdqu	($inout4,&QWP(0x40,$inp));
    561 	&movups	(&QWP(0x50,$out),$inout5);
    562 	&lea	($out,&DWP(0x60,$out));
    563 	&movdqu	($inout5,&QWP(0x50,$inp));
    564 	&lea	($inp,&DWP(0x60,$inp));
    565 &set_label("ecb_dec_loop6_enter");
    566 
    567 	&call	("_aesni_decrypt6");
    568 
    569 	&mov	($key,$key_);		# restore $key
    570 	&mov	($rounds,$rounds_);	# restore $rounds
    571 	&sub	($len,0x60);
    572 	&jnc	(&label("ecb_dec_loop6"));
    573 
    574 	&movups	(&QWP(0,$out),$inout0);
    575 	&movups	(&QWP(0x10,$out),$inout1);
    576 	&movups	(&QWP(0x20,$out),$inout2);
    577 	&movups	(&QWP(0x30,$out),$inout3);
    578 	&movups	(&QWP(0x40,$out),$inout4);
    579 	&movups	(&QWP(0x50,$out),$inout5);
    580 	&lea	($out,&DWP(0x60,$out));
    581 	&add	($len,0x60);
    582 	&jz	(&label("ecb_ret"));
    583 
    584 &set_label("ecb_dec_tail");
    585 	&movups	($inout0,&QWP(0,$inp));
    586 	&cmp	($len,0x20);
    587 	&jb	(&label("ecb_dec_one"));
    588 	&movups	($inout1,&QWP(0x10,$inp));
    589 	&je	(&label("ecb_dec_two"));
    590 	&movups	($inout2,&QWP(0x20,$inp));
    591 	&cmp	($len,0x40);
    592 	&jb	(&label("ecb_dec_three"));
    593 	&movups	($inout3,&QWP(0x30,$inp));
    594 	&je	(&label("ecb_dec_four"));
    595 	&movups	($inout4,&QWP(0x40,$inp));
    596 	&xorps	($inout5,$inout5);
    597 	&call	("_aesni_decrypt6");
    598 	&movups	(&QWP(0,$out),$inout0);
    599 	&movups	(&QWP(0x10,$out),$inout1);
    600 	&movups	(&QWP(0x20,$out),$inout2);
    601 	&movups	(&QWP(0x30,$out),$inout3);
    602 	&movups	(&QWP(0x40,$out),$inout4);
    603 	&jmp	(&label("ecb_ret"));
    604 
    605 &set_label("ecb_dec_one",16);
    606 	if ($inline)
    607 	{   &aesni_inline_generate1("dec");	}
    608 	else
    609 	{   &call	("_aesni_decrypt1");	}
    610 	&movups	(&QWP(0,$out),$inout0);
    611 	&jmp	(&label("ecb_ret"));
    612 
    613 &set_label("ecb_dec_two",16);
    614 	&call	("_aesni_decrypt2");
    615 	&movups	(&QWP(0,$out),$inout0);
    616 	&movups	(&QWP(0x10,$out),$inout1);
    617 	&jmp	(&label("ecb_ret"));
    618 
    619 &set_label("ecb_dec_three",16);
    620 	&call	("_aesni_decrypt3");
    621 	&movups	(&QWP(0,$out),$inout0);
    622 	&movups	(&QWP(0x10,$out),$inout1);
    623 	&movups	(&QWP(0x20,$out),$inout2);
    624 	&jmp	(&label("ecb_ret"));
    625 
        # The four-block decrypt case is last and falls through to ecb_ret.
    626 &set_label("ecb_dec_four",16);
    627 	&call	("_aesni_decrypt4");
    628 	&movups	(&QWP(0,$out),$inout0);
    629 	&movups	(&QWP(0x10,$out),$inout1);
    630 	&movups	(&QWP(0x20,$out),$inout2);
    631 	&movups	(&QWP(0x30,$out),$inout3);
    632 
    633 &set_label("ecb_ret");
    634 	&pxor	("xmm0","xmm0");		# clear register bank
    635 	&pxor	("xmm1","xmm1");
    636 	&pxor	("xmm2","xmm2");
    637 	&pxor	("xmm3","xmm3");
    638 	&pxor	("xmm4","xmm4");
    639 	&pxor	("xmm5","xmm5");
    640 	&pxor	("xmm6","xmm6");
    641 	&pxor	("xmm7","xmm7");
    642 &function_end("aesni_ecb_encrypt");
    643 
    645 ######################################################################
    646 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
    647 #                         size_t blocks, const AES_KEY *key,
    648 #                         const char *ivec,char *cmac);
    649 #
    650 # Handles only complete blocks, operates on 64-bit counter and
    651 # does not update *ivec! Nor does it finalize CMAC value
    652 # (see engine/eng_aesni.c for details)
    653 #
    654 { my $cmac=$inout1;
    655 &function_begin("aesni_ccm64_encrypt_blocks");
        	# Argument use: wparam(0)=in, (1)=out, (2)=block count, (3)=key,
        	# (4)=ivec pointer (held briefly in $rounds_), (5)=cmac pointer
        	# (held briefly in $rounds).
        	#
        	# Stack frame built below (offsets from the aligned esp):
        	#    0  byte-swap control mask for pshufb
        	#   16  counter increment vector (dwords 1,0,0,0)
        	#   48  caller's esp
        	#
        	# Each pass of the outer loop runs the counter block ($inout0) and
        	# the running CMAC ($cmac, xored with the input block) through the
        	# cipher side by side, xors the encrypted counter into the input to
        	# form the output block, and increments the byte-swapped counter
        	# kept in $ivec with paddq.
    656 	&mov	($inp,&wparam(0));
    657 	&mov	($out,&wparam(1));
    658 	&mov	($len,&wparam(2));
    659 	&mov	($key,&wparam(3));
    660 	&mov	($rounds_,&wparam(4));
    661 	&mov	($rounds,&wparam(5));
    662 	&mov	($key_,"esp");
    663 	&sub	("esp",60);
    664 	&and	("esp",-16);			# align stack
    665 	&mov	(&DWP(48,"esp"),$key_);
    666 
    667 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    668 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    669 	&mov	($rounds,&DWP(240,$key));
    670 
    671 	# compose byte-swap control mask for pshufb on stack
    672 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    673 	&mov	(&DWP(4,"esp"),0x08090a0b);
    674 	&mov	(&DWP(8,"esp"),0x04050607);
    675 	&mov	(&DWP(12,"esp"),0x00010203);
    676 
    677 	# compose counter increment vector on stack
    678 	&mov	($rounds_,1);
    679 	&xor	($key_,$key_);
    680 	&mov	(&DWP(16,"esp"),$rounds_);
    681 	&mov	(&DWP(20,"esp"),$key_);
    682 	&mov	(&DWP(24,"esp"),$key_);
    683 	&mov	(&DWP(28,"esp"),$key_);
    684 
        	# $key_ keeps the start of the key schedule, $key is advanced by
        	# 16*rounds+32, and $rounds_ becomes 16-16*rounds, the starting
        	# value of the negative index/counter used by the inner loop.
        	# $inout0 receives the counter block as loaded; $ivec is then
        	# byte-swapped so that paddq can increment it.
    685 	&shl	($rounds,4);
    686 	&mov	($rounds_,16);
    687 	&lea	($key_,&DWP(0,$key));
    688 	&movdqa	($inout3,&QWP(0,"esp"));
    689 	&movdqa	($inout0,$ivec);
    690 	&lea	($key,&DWP(32,$key,$rounds));
    691 	&sub	($rounds_,$rounds);
    692 	&pshufb	($ivec,$inout3);
    693 
    694 &set_label("ccm64_enc_outer");
    695 	&$movekey	($rndkey0,&QWP(0,$key_));
    696 	&mov		($rounds,$rounds_);
    697 	&movups		($in0,&QWP(0,$inp));
    698 
    699 	&xorps		($inout0,$rndkey0);
    700 	&$movekey	($rndkey1,&QWP(16,$key_));
    701 	&xorps		($rndkey0,$in0);
    702 	&xorps		($cmac,$rndkey0);		# cmac^=inp
    703 	&$movekey	($rndkey0,&QWP(32,$key_));
    704 
    705 &set_label("ccm64_enc2_loop");
    706 	&aesenc		($inout0,$rndkey1);
    707 	&aesenc		($cmac,$rndkey1);
    708 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    709 	&add		($rounds,32);
    710 	&aesenc		($inout0,$rndkey0);
    711 	&aesenc		($cmac,$rndkey0);
    712 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    713 	&jnz		(&label("ccm64_enc2_loop"));
    714 	&aesenc		($inout0,$rndkey1);
    715 	&aesenc		($cmac,$rndkey1);
    716 	&paddq		($ivec,&QWP(16,"esp"));
        	# The flags set by this dec are consumed by the jnz at the end of
        	# the outer loop; the instructions in between do not modify flags.
    717 	&dec		($len);
    718 	&aesenclast	($inout0,$rndkey0);
    719 	&aesenclast	($cmac,$rndkey0);
    720 
    721 	&lea	($inp,&DWP(16,$inp));
    722 	&xorps	($in0,$inout0);			# inp^=E(ivec)
    723 	&movdqa	($inout0,$ivec);
    724 	&movups	(&QWP(0,$out),$in0);		# save output
    725 	&pshufb	($inout0,$inout3);
    726 	&lea	($out,&DWP(16,$out));
    727 	&jnz	(&label("ccm64_enc_outer"));
    728 
        	# Restore the caller's esp, then write the updated CMAC back
        	# through the pointer passed as the sixth argument.
    729 	&mov	("esp",&DWP(48,"esp"));
    730 	&mov	($out,&wparam(5));
    731 	&movups	(&QWP(0,$out),$cmac);
    732 
    733 	&pxor	("xmm0","xmm0");		# clear register bank
    734 	&pxor	("xmm1","xmm1");
    735 	&pxor	("xmm2","xmm2");
    736 	&pxor	("xmm3","xmm3");
    737 	&pxor	("xmm4","xmm4");
    738 	&pxor	("xmm5","xmm5");
    739 	&pxor	("xmm6","xmm6");
    740 	&pxor	("xmm7","xmm7");
    741 &function_end("aesni_ccm64_encrypt_blocks");
    742 
    743 &function_begin("aesni_ccm64_decrypt_blocks");
        	# Argument use and stack frame: wparam(0)=in, (1)=out, (2)=block
        	# count, (3)=key, (4)=ivec pointer, (5)=cmac pointer; the aligned
        	# frame holds the pshufb byte-swap mask at 0, the counter increment
        	# vector (dwords 1,0,0,0) at 16 and the caller's esp at 48.
        	#
        	# The first counter block is encrypted on its own before the loop.
        	# Every later pass of the outer loop produces an output block by
        	# xoring the encrypted counter into the input, then runs the next
        	# counter block and the CMAC (xored with the block just produced)
        	# through the cipher side by side.  After the last block the CMAC
        	# gets one final single-block encryption at ccm64_dec_break.
    744 	&mov	($inp,&wparam(0));
    745 	&mov	($out,&wparam(1));
    746 	&mov	($len,&wparam(2));
    747 	&mov	($key,&wparam(3));
    748 	&mov	($rounds_,&wparam(4));
    749 	&mov	($rounds,&wparam(5));
    750 	&mov	($key_,"esp");
    751 	&sub	("esp",60);
    752 	&and	("esp",-16);			# align stack
    753 	&mov	(&DWP(48,"esp"),$key_);
    754 
    755 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
    756 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
    757 	&mov	($rounds,&DWP(240,$key));
    758 
    759 	# compose byte-swap control mask for pshufb on stack
    760 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    761 	&mov	(&DWP(4,"esp"),0x08090a0b);
    762 	&mov	(&DWP(8,"esp"),0x04050607);
    763 	&mov	(&DWP(12,"esp"),0x00010203);
    764 
    765 	# compose counter increment vector on stack
    766 	&mov	($rounds_,1);
    767 	&xor	($key_,$key_);
    768 	&mov	(&DWP(16,"esp"),$rounds_);
    769 	&mov	(&DWP(20,"esp"),$key_);
    770 	&mov	(&DWP(24,"esp"),$key_);
    771 	&mov	(&DWP(28,"esp"),$key_);
    772 
    773 	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
    774 	&movdqa	($inout0,$ivec);
    775 
        	# Keep copies of $key and $rounds: the single-block routine below
        	# modifies both.
    776 	&mov	($key_,$key);
    777 	&mov	($rounds_,$rounds);
    778 
    779 	&pshufb	($ivec,$inout3);
    780 	if ($inline)
    781 	{   &aesni_inline_generate1("enc");	}
    782 	else
    783 	{   &call	("_aesni_encrypt1");	}
        	# From here on $rounds_ holds 16-16*rounds (the starting value of
        	# the negative index/counter of the inner loop) and $key points
        	# 16*rounds+32 bytes past the start of the schedule kept in $key_.
    784 	&shl	($rounds_,4);
    785 	&mov	($rounds,16);
    786 	&movups	($in0,&QWP(0,$inp));		# load inp
    787 	&paddq	($ivec,&QWP(16,"esp"));
    788 	&lea	($inp,&QWP(16,$inp));
    789 	&sub	($rounds,$rounds_);
    790 	&lea	($key,&DWP(32,$key_,$rounds_));
    791 	&mov	($rounds_,$rounds);
    792 	&jmp	(&label("ccm64_dec_outer"));
    793 
    794 &set_label("ccm64_dec_outer",16);
    795 	&xorps	($in0,$inout0);			# inp ^= E(ivec)
    796 	&movdqa	($inout0,$ivec);
    797 	&movups	(&QWP(0,$out),$in0);		# save output
    798 	&lea	($out,&DWP(16,$out));
    799 	&pshufb	($inout0,$inout3);
    800 
    801 	&sub	($len,1);
    802 	&jz	(&label("ccm64_dec_break"));
    803 
    804 	&$movekey	($rndkey0,&QWP(0,$key_));
    805 	&mov		($rounds,$rounds_);
    806 	&$movekey	($rndkey1,&QWP(16,$key_));
    807 	&xorps		($in0,$rndkey0);
    808 	&xorps		($inout0,$rndkey0);
    809 	&xorps		($cmac,$in0);		# cmac^=out
    810 	&$movekey	($rndkey0,&QWP(32,$key_));
    811 
    812 &set_label("ccm64_dec2_loop");
    813 	&aesenc		($inout0,$rndkey1);
    814 	&aesenc		($cmac,$rndkey1);
    815 	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
    816 	&add		($rounds,32);
    817 	&aesenc		($inout0,$rndkey0);
    818 	&aesenc		($cmac,$rndkey0);
    819 	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    820 	&jnz		(&label("ccm64_dec2_loop"));
    821 	&movups		($in0,&QWP(0,$inp));	# load inp
    822 	&paddq		($ivec,&QWP(16,"esp"));
    823 	&aesenc		($inout0,$rndkey1);
    824 	&aesenc		($cmac,$rndkey1);
    825 	&aesenclast	($inout0,$rndkey0);
    826 	&aesenclast	($cmac,$rndkey0);
    827 	&lea		($inp,&QWP(16,$inp));
    828 	&jmp	(&label("ccm64_dec_outer"));
    829 
        # Last block: $in0 still holds the final output block.  The inline
        # expansion xors round key 0 into $in0 and $in0 into $cmac before
        # encrypting $cmac.
        # NOTE(review): the non-inline branch passes $cmac as an extra argument
        # to &call, while _aesni_encrypt1 is generated for the default $inout0
        # and takes no $in0 -- confirm this path is correct before setting
        # $inline to 0.
    830 &set_label("ccm64_dec_break",16);
    831 	&mov	($rounds,&DWP(240,$key_));
    832 	&mov	($key,$key_);
    833 	if ($inline)
    834 	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
    835 	else
    836 	{   &call	("_aesni_encrypt1",$cmac);	}
    837 
        	# Restore the caller's esp, then write the updated CMAC back
        	# through the pointer passed as the sixth argument.
    838 	&mov	("esp",&DWP(48,"esp"));
    839 	&mov	($out,&wparam(5));
    840 	&movups	(&QWP(0,$out),$cmac);
    841 
    842 	&pxor	("xmm0","xmm0");		# clear register bank
    843 	&pxor	("xmm1","xmm1");
    844 	&pxor	("xmm2","xmm2");
    845 	&pxor	("xmm3","xmm3");
    846 	&pxor	("xmm4","xmm4");
    847 	&pxor	("xmm5","xmm5");
    848 	&pxor	("xmm6","xmm6");
    849 	&pxor	("xmm7","xmm7");
    850 &function_end("aesni_ccm64_decrypt_blocks");
    851 }
    852 
    854 ######################################################################
    855 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
    856 #                         size_t blocks, const AES_KEY *key,
    857 #                         const char *ivec);
    858 #
    859 # Handles only complete blocks, operates on 32-bit counter and
    860 # does not update *ivec! (see crypto/modes/ctr128.c for details)
    861 #
    862 # stack layout:
    863 #	0	pshufb mask
    864 #	16	vector addend: 0,6,6,6
    865 # 	32	counter-less ivec
    866 #	48	1st triplet of counter vector
    867 #	64	2nd triplet of counter vector
    868 #	80	saved %esp
    869 
    870 	&function_begin("aesni_ctr32_encrypt_blocks");
        	# Emits the CTR-mode routine: each keystream block is the AES
        	# encryption of the ivec with its last dword replaced by a running
        	# 32-bit counter; the keystream is XORed with the input and written
        	# to the output. Blocks are handled six at a time in ctr32_loop6,
        	# with dedicated paths for the remaining 1..5 blocks.
    871 	&mov	($inp,&wparam(0));
    872 	&mov	($out,&wparam(1));
    873 	&mov	($len,&wparam(2));		# number of 16-byte blocks
    874 	&mov	($key,&wparam(3));
    875 	&mov	($rounds_,&wparam(4));		# ivec pointer, parked in $rounds_ for now
    876 	&mov	($key_,"esp");			# remember caller's %esp
    877 	&sub	("esp",88);
    878 	&and	("esp",-16);			# align stack
    879 	&mov	(&DWP(80,"esp"),$key_);		# saved %esp slot
    880 
    881 	&cmp	($len,1);
    882 	&je	(&label("ctr32_one_shortcut"));	# single block: no counter setup needed
    883 
    884 	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
    885 
    886 	# compose byte-swap control mask for pshufb on stack
        	# (mask byte i selects source byte 15-i, i.e. a full 16-byte reversal)
    887 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
    888 	&mov	(&DWP(4,"esp"),0x08090a0b);
    889 	&mov	(&DWP(8,"esp"),0x04050607);
    890 	&mov	(&DWP(12,"esp"),0x00010203);
    891 
    892 	# compose counter increment vector on stack
        	# (dwords 6,6,6,0 in memory order: each of the three counters in a
        	# triplet advances by 6 per loop iteration)
    893 	&mov	($rounds,6);
    894 	&xor	($key_,$key_);
    895 	&mov	(&DWP(16,"esp"),$rounds);
    896 	&mov	(&DWP(20,"esp"),$rounds);
    897 	&mov	(&DWP(24,"esp"),$rounds);
    898 	&mov	(&DWP(28,"esp"),$key_);
    899 
    900 	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
    901 	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
    902 
    903 	&mov	($rounds,&DWP(240,$key));	# key->rounds
    904 
    905 	# compose 2 vectors of 3x32-bit counters
        	# After the bswap the counter can be stepped with plain integer
        	# arithmetic: $rndkey0 = {n, n+1, n+2, 0}, $rndkey1 = {n+3, n+4, n+5, 0}.
    906 	&bswap	($rounds_);
    907 	&pxor	($rndkey0,$rndkey0);
    908 	&pxor	($rndkey1,$rndkey1);
    909 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
    910 	&pinsrd	($rndkey0,$rounds_,0);
    911 	&lea	($key_,&DWP(3,$rounds_));
    912 	&pinsrd	($rndkey1,$key_,0);
    913 	&inc	($rounds_);
    914 	&pinsrd	($rndkey0,$rounds_,1);
    915 	&inc	($key_);
    916 	&pinsrd	($rndkey1,$key_,1);
    917 	&inc	($rounds_);
    918 	&pinsrd	($rndkey0,$rounds_,2);
    919 	&inc	($key_);
    920 	&pinsrd	($rndkey1,$key_,2);
    921 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
    922 	&pshufb	($rndkey0,$inout0);		# byte swap
    923 	&movdqu	($inout4,&QWP(0,$key));		# key[0]
    924 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
    925 	&pshufb	($rndkey1,$inout0);		# byte swap
    926 
        	# After the reversal the three counters sit byte-swapped in dwords
        	# 3,2,1 and dword 0 is zero; pshufd with k<<6 copies dword k to the
        	# top and the zero dword everywhere else.
    927 	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
    928 	&pshufd	($inout1,$rndkey0,2<<6);
    929 	&cmp	($len,6);
    930 	&jb	(&label("ctr32_tail"));		# fewer than 6 blocks in total
    931 	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
    932 	&shl	($rounds,4);
    933 	&mov	($rounds_,16);
    934 	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
    935 	&mov	($key_,$key);			# backup $key
    936 	&sub	($rounds_,$rounds);		# backup twisted $rounds
    937 	&lea	($key,&DWP(32,$key,$rounds));
    938 	&sub	($len,6);
    939 	&jmp	(&label("ctr32_loop6"));
    940 
    941 &set_label("ctr32_loop6",16);
    942 	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
        	# On entry $inout0/$inout1 already hold the first two counters and
        	# $rndkey0/$rndkey1 the byte-swapped triplets. Key[0] is folded in
        	# through the saved ivec^key[0], and the first aesenc round uses the
        	# round key at offset 16 before entering _aesni_encrypt6_enter.
    943 	&pshufd	($inout2,$rndkey0,1<<6);
    944 	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
    945 	&pshufd	($inout3,$rndkey1,3<<6);
    946 	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
    947 	&pshufd	($inout4,$rndkey1,2<<6);
    948 	&pxor		($inout1,$rndkey0);
    949 	&pshufd	($inout5,$rndkey1,1<<6);
    950 	&$movekey	($rndkey1,&QWP(16,$key_));
    951 	&pxor		($inout2,$rndkey0);
    952 	&pxor		($inout3,$rndkey0);
    953 	&aesenc		($inout0,$rndkey1);
    954 	&pxor		($inout4,$rndkey0);
    955 	&pxor		($inout5,$rndkey0);
    956 	&aesenc		($inout1,$rndkey1);
    957 	&$movekey	($rndkey0,&QWP(32,$key_));
    958 	&mov		($rounds,$rounds_);
    959 	&aesenc		($inout2,$rndkey1);
    960 	&aesenc		($inout3,$rndkey1);
    961 	&aesenc		($inout4,$rndkey1);
    962 	&aesenc		($inout5,$rndkey1);
    963 
    964 	&call		(&label("_aesni_encrypt6_enter"));
    965 
        	# XOR the six keystream blocks with the input and store them,
        	# interleaved with advancing both counter triplets for the next pass.
    966 	&movups	($rndkey1,&QWP(0,$inp));
    967 	&movups	($rndkey0,&QWP(0x10,$inp));
    968 	&xorps	($inout0,$rndkey1);
    969 	&movups	($rndkey1,&QWP(0x20,$inp));
    970 	&xorps	($inout1,$rndkey0);
    971 	&movups	(&QWP(0,$out),$inout0);
    972 	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
    973 	&xorps	($inout2,$rndkey1);
    974 	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
    975 	&movups	(&QWP(0x10,$out),$inout1);
    976 	&movups	(&QWP(0x20,$out),$inout2);
    977 
    978 	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
    979 	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
    980 	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
    981 
    982 	&movups	($inout1,&QWP(0x30,$inp));
    983 	&movups	($inout2,&QWP(0x40,$inp));
    984 	&xorps	($inout3,$inout1);
    985 	&movups	($inout1,&QWP(0x50,$inp));
    986 	&lea	($inp,&DWP(0x60,$inp));
    987 	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
    988 	&pshufb	($rndkey0,$inout0);		# byte swap
    989 	&xorps	($inout4,$inout2);
    990 	&movups	(&QWP(0x30,$out),$inout3);
    991 	&xorps	($inout5,$inout1);
    992 	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
    993 	&pshufb	($rndkey1,$inout0);		# byte swap
    994 	&movups	(&QWP(0x40,$out),$inout4);
    995 	&pshufd	($inout0,$rndkey0,3<<6);
    996 	&movups	(&QWP(0x50,$out),$inout5);
    997 	&lea	($out,&DWP(0x60,$out));
    998 
    999 	&pshufd	($inout1,$rndkey0,2<<6);
   1000 	&sub	($len,6);
   1001 	&jnc	(&label("ctr32_loop6"));	# loop while at least 6 blocks remained
   1002 
   1003 	&add	($len,6);			# undo the last subtraction: 0..5 blocks left
   1004 	&jz	(&label("ctr32_ret"));
   1005 	&movdqu	($inout5,&QWP(0,$key_));	# key[0]
   1006 	&mov	($key,$key_);			# restore $key
   1007 	&pxor	($inout5,&QWP(32,"esp"));	# restore counter-less ivec (undo ^key[0])
   1008 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1009 
   1010 &set_label("ctr32_tail");
        	# 1..5 blocks left. $inout5 is the counter-less ivec (counter dword
        	# zero) and each counter vector is zero outside its top dword, so
        	# por merges the two. The je branches below reuse the flags of the
        	# preceding cmp: the SSE instructions in between leave EFLAGS alone.
   1011 	&por	($inout0,$inout5);
   1012 	&cmp	($len,2);
   1013 	&jb	(&label("ctr32_one"));
   1014 
   1015 	&pshufd	($inout2,$rndkey0,1<<6);
   1016 	&por	($inout1,$inout5);
   1017 	&je	(&label("ctr32_two"));
   1018 
   1019 	&pshufd	($inout3,$rndkey1,3<<6);
   1020 	&por	($inout2,$inout5);
   1021 	&cmp	($len,4);
   1022 	&jb	(&label("ctr32_three"));
   1023 
   1024 	&pshufd	($inout4,$rndkey1,2<<6);
   1025 	&por	($inout3,$inout5);
   1026 	&je	(&label("ctr32_four"));
   1027 
        	# five blocks: run the 6-way routine; only $inout0..$inout4 are
        	# stored afterwards, the sixth lane's result is unused
   1028 	&por	($inout4,$inout5);
   1029 	&call	("_aesni_encrypt6");
   1030 	&movups	($rndkey1,&QWP(0,$inp));
   1031 	&movups	($rndkey0,&QWP(0x10,$inp));
   1032 	&xorps	($inout0,$rndkey1);
   1033 	&movups	($rndkey1,&QWP(0x20,$inp));
   1034 	&xorps	($inout1,$rndkey0);
   1035 	&movups	($rndkey0,&QWP(0x30,$inp));
   1036 	&xorps	($inout2,$rndkey1);
   1037 	&movups	($rndkey1,&QWP(0x40,$inp));
   1038 	&xorps	($inout3,$rndkey0);
   1039 	&movups	(&QWP(0,$out),$inout0);
   1040 	&xorps	($inout4,$rndkey1);
   1041 	&movups	(&QWP(0x10,$out),$inout1);
   1042 	&movups	(&QWP(0x20,$out),$inout2);
   1043 	&movups	(&QWP(0x30,$out),$inout3);
   1044 	&movups	(&QWP(0x40,$out),$inout4);
   1045 	&jmp	(&label("ctr32_ret"));
   1046 
   1047 &set_label("ctr32_one_shortcut",16);
        	# reached only when blocks == 1: the ivec is used as-is as the
        	# single counter block ($rounds_ still holds the ivec pointer)
   1048 	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
   1049 	&mov	($rounds,&DWP(240,$key));
   1050 
   1051 &set_label("ctr32_one");
   1052 	if ($inline)
   1053 	{   &aesni_inline_generate1("enc");	}
   1054 	else
   1055 	{   &call	("_aesni_encrypt1");	}
   1056 	&movups	($in0,&QWP(0,$inp));
   1057 	&xorps	($in0,$inout0);			# inp ^= keystream
   1058 	&movups	(&QWP(0,$out),$in0);
   1059 	&jmp	(&label("ctr32_ret"));
   1060 
   1061 &set_label("ctr32_two",16);
   1062 	&call	("_aesni_encrypt2");
   1063 	&movups	($inout3,&QWP(0,$inp));
   1064 	&movups	($inout4,&QWP(0x10,$inp));
   1065 	&xorps	($inout0,$inout3);
   1066 	&xorps	($inout1,$inout4);
   1067 	&movups	(&QWP(0,$out),$inout0);
   1068 	&movups	(&QWP(0x10,$out),$inout1);
   1069 	&jmp	(&label("ctr32_ret"));
   1070 
   1071 &set_label("ctr32_three",16);
   1072 	&call	("_aesni_encrypt3");
   1073 	&movups	($inout3,&QWP(0,$inp));
   1074 	&movups	($inout4,&QWP(0x10,$inp));
   1075 	&xorps	($inout0,$inout3);
   1076 	&movups	($inout5,&QWP(0x20,$inp));
   1077 	&xorps	($inout1,$inout4);
   1078 	&movups	(&QWP(0,$out),$inout0);
   1079 	&xorps	($inout2,$inout5);
   1080 	&movups	(&QWP(0x10,$out),$inout1);
   1081 	&movups	(&QWP(0x20,$out),$inout2);
   1082 	&jmp	(&label("ctr32_ret"));
   1083 
   1084 &set_label("ctr32_four",16);
   1085 	&call	("_aesni_encrypt4");
   1086 	&movups	($inout4,&QWP(0,$inp));
   1087 	&movups	($inout5,&QWP(0x10,$inp));
   1088 	&movups	($rndkey1,&QWP(0x20,$inp));
   1089 	&xorps	($inout0,$inout4);
   1090 	&movups	($rndkey0,&QWP(0x30,$inp));
   1091 	&xorps	($inout1,$inout5);
   1092 	&movups	(&QWP(0,$out),$inout0);
   1093 	&xorps	($inout2,$rndkey1);
   1094 	&movups	(&QWP(0x10,$out),$inout1);
   1095 	&xorps	($inout3,$rndkey0);
   1096 	&movups	(&QWP(0x20,$out),$inout2);
   1097 	&movups	(&QWP(0x30,$out),$inout3);
   1098 
   1099 &set_label("ctr32_ret");
        	# common exit: zero all xmm registers and the stack slots that held
        	# the ivec and the counter triplets (offsets 32, 48, 64)
   1100 	&pxor	("xmm0","xmm0");		# clear register bank
   1101 	&pxor	("xmm1","xmm1");
   1102 	&pxor	("xmm2","xmm2");
   1103 	&pxor	("xmm3","xmm3");
   1104 	&pxor	("xmm4","xmm4");
   1105 	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
   1106 	&pxor	("xmm5","xmm5");
   1107 	&movdqa	(&QWP(48,"esp"),"xmm0");
   1108 	&pxor	("xmm6","xmm6");
   1109 	&movdqa	(&QWP(64,"esp"),"xmm0");
   1110 	&pxor	("xmm7","xmm7");
   1111 	&mov	("esp",&DWP(80,"esp"));		# restore caller's %esp
   1112 &function_end("aesni_ctr32_encrypt_blocks");
   1113 
   1115 ######################################################################
   1116 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
   1117 #	const AES_KEY *key1, const AES_KEY *key2,
   1118 #	const unsigned char iv[16]);
   1119 #
   1120 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
   1121 
   1122 &function_begin("aesni_xts_encrypt");
        	# Emits the XTS encryption routine. The initial tweak is the iv
        	# encrypted under key2. Every 16-byte block is then processed as
        	# E_key1(block ^ tweak) ^ tweak, and between blocks the tweak is
        	# doubled: shifted left one bit as a 128-bit value, with 0x87 XORed
        	# into the low dword when a bit falls off the top. Six blocks are
        	# handled per loop iteration, 1..5 leftover blocks have dedicated
        	# paths, and a trailing partial block goes through xts_enc_steal.
        	#
        	# Stack frame (after alignment): 16*0..16*5 tweaks, 16*6 the
        	# {0x87,0,1,0} constant, 16*7+0 length, 16*7+4 caller's %esp.
   1123 	&mov	($key,&wparam(4));		# key2
   1124 	&mov	($inp,&wparam(5));		# clear-text tweak
   1125 
   1126 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
   1127 	&movups	($inout0,&QWP(0,$inp));
   1128 	if ($inline)
   1129 	{   &aesni_inline_generate1("enc");	}
   1130 	else
   1131 	{   &call	("_aesni_encrypt1");	}
   1132 
   1133 	&mov	($inp,&wparam(0));
   1134 	&mov	($out,&wparam(1));
   1135 	&mov	($len,&wparam(2));
   1136 	&mov	($key,&wparam(3));		# key1
   1137 
   1138 	&mov	($key_,"esp");			# remember caller's %esp
   1139 	&sub	("esp",16*7+8);
   1140 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
   1141 	&and	("esp",-16);			# align stack
   1142 
   1143 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
   1144 	&mov	(&DWP(16*6+4,"esp"),0);
   1145 	&mov	(&DWP(16*6+8,"esp"),1);
   1146 	&mov	(&DWP(16*6+12,"esp"),0);
   1147 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
   1148 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
   1149 
        	# $twtmp = (0 > $tweak) per signed dword, i.e. all-ones in every
        	# dword whose top bit is set; this feeds the next tweak doubling
   1150 	&movdqa	($tweak,$inout0);
   1151 	&pxor	($twtmp,$twtmp);
   1152 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
   1153 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1154 
   1155 	&and	($len,-16);			# whole blocks only; len%16 is recovered later from the saved length
   1156 	&mov	($key_,$key);			# backup $key
   1157 	&mov	($rounds_,$rounds);		# backup $rounds
   1158 	&sub	($len,16*6);
   1159 	&jc	(&label("xts_enc_short"));	# fewer than six whole blocks
   1160 
        	# set up $rounds_ = 16-16*rounds and $key = key1+32+16*rounds, the
        	# values in place when _aesni_encrypt6_enter is called below
   1161 	&shl	($rounds,4);
   1162 	&mov	($rounds_,16);
   1163 	&sub	($rounds_,$rounds);
   1164 	&lea	($key,&DWP(32,$key,$rounds));
   1165 	&jmp	(&label("xts_enc_loop6"));
   1166 
   1167 &set_label("xts_enc_loop6",16);
        	# Store tweaks 0..3 at 16*$i(%esp), doubling after each store.
        	# pshufd 0x13 moves the sign mask of dword 3 into dword 0 and that
        	# of dword 1 into dword 2; pand with {0x87,0,1,0} then leaves the
        	# 0x87 residue and the low-to-high qword carry, which are XORed in
        	# after paddq has shifted both 64-bit halves left by one.
   1168 	for ($i=0;$i<4;$i++) {
   1169 	    &pshufd	($twres,$twtmp,0x13);
   1170 	    &pxor	($twtmp,$twtmp);
   1171 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
   1172 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
   1173 	    &pand	($twres,$twmask);	# isolate carry and residue
   1174 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
   1175 	    &pxor	($tweak,$twres);
   1176 	}
        	# $i is 4 here: the fifth tweak goes to 16*4(%esp) and, after the
        	# post-increment, the sixth is kept in $inout5 and saved at 16*5
   1177 	&pshufd	($inout5,$twtmp,0x13);
   1178 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
   1179 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1180 	 &$movekey	($rndkey0,&QWP(0,$key_));
   1181 	&pand	($inout5,$twmask);		# isolate carry and residue
   1182 	 &movups	($inout0,&QWP(0,$inp));	# load input
   1183 	&pxor	($inout5,$tweak);
   1184 
   1185 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
   1186 	&mov	($rounds,$rounds_);		# restore $rounds
   1187 	&movdqu	($inout1,&QWP(16*1,$inp));
   1188 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
   1189 	&movdqu	($inout2,&QWP(16*2,$inp));
   1190 	 &pxor		($inout1,$rndkey0);
   1191 	&movdqu	($inout3,&QWP(16*3,$inp));
   1192 	 &pxor		($inout2,$rndkey0);
   1193 	&movdqu	($inout4,&QWP(16*4,$inp));
   1194 	 &pxor		($inout3,$rndkey0);
   1195 	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block, via $rndkey1 as scratch
   1196 	 &pxor		($inout4,$rndkey0);
   1197 	&lea	($inp,&DWP(16*6,$inp));
   1198 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1199 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
   1200 	&pxor	($inout5,$rndkey1);		# tweak ^ sixth input block
   1201 
   1202 	 &$movekey	($rndkey1,&QWP(16,$key_));
   1203 	&pxor	($inout1,&QWP(16*1,"esp"));
   1204 	&pxor	($inout2,&QWP(16*2,"esp"));
   1205 	 &aesenc	($inout0,$rndkey1);
   1206 	&pxor	($inout3,&QWP(16*3,"esp"));
   1207 	&pxor	($inout4,&QWP(16*4,"esp"));
   1208 	 &aesenc	($inout1,$rndkey1);
   1209 	&pxor		($inout5,$rndkey0);
   1210 	 &$movekey	($rndkey0,&QWP(32,$key_));
   1211 	 &aesenc	($inout2,$rndkey1);
   1212 	 &aesenc	($inout3,$rndkey1);
   1213 	 &aesenc	($inout4,$rndkey1);
   1214 	 &aesenc	($inout5,$rndkey1);
   1215 	&call		(&label("_aesni_encrypt6_enter"));
   1216 
        	# XOR the tweaks back in and store six output blocks, interleaved
        	# with doubling the last tweak to get the next iteration's first
   1217 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
   1218        &pxor	($twtmp,$twtmp);
   1219 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1220        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
   1221 	&xorps	($inout1,&QWP(16*1,"esp"));
   1222 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1223 	&xorps	($inout2,&QWP(16*2,"esp"));
   1224 	&movups	(&QWP(16*1,$out),$inout1);
   1225 	&xorps	($inout3,&QWP(16*3,"esp"));
   1226 	&movups	(&QWP(16*2,$out),$inout2);
   1227 	&xorps	($inout4,&QWP(16*4,"esp"));
   1228 	&movups	(&QWP(16*3,$out),$inout3);
   1229 	&xorps	($inout5,$tweak);
   1230 	&movups	(&QWP(16*4,$out),$inout4);
   1231        &pshufd	($twres,$twtmp,0x13);
   1232 	&movups	(&QWP(16*5,$out),$inout5);
   1233 	&lea	($out,&DWP(16*6,$out));
   1234        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
   1235 
   1236 	&pxor	($twtmp,$twtmp);
   1237 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1238 	&pand	($twres,$twmask);		# isolate carry and residue
   1239 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1240 	&pxor	($tweak,$twres);
   1241 
   1242 	&sub	($len,16*6);
   1243 	&jnc	(&label("xts_enc_loop6"));	# loop while at least six blocks remained
   1244 
   1245 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1246 	&mov	($key,$key_);			# restore $key
   1247 	&mov	($rounds_,$rounds);
   1248 
   1249 &set_label("xts_enc_short");
        	# 0..5 whole blocks left; $tweak is the tweak for the next block.
        	# The je branches below reuse the flags of the preceding cmp: the
        	# SSE instructions in between leave EFLAGS alone.
   1250 	&add	($len,16*6);
   1251 	&jz	(&label("xts_enc_done6x"));
   1252 
   1253 	&movdqa	($inout3,$tweak);		# put aside previous tweak
   1254 	&cmp	($len,0x20);
   1255 	&jb	(&label("xts_enc_one"));
   1256 
   1257 	&pshufd	($twres,$twtmp,0x13);
   1258 	&pxor	($twtmp,$twtmp);
   1259 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1260 	&pand	($twres,$twmask);		# isolate carry and residue
   1261 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1262 	&pxor	($tweak,$twres);
   1263 	&je	(&label("xts_enc_two"));
   1264 
   1265 	&pshufd	($twres,$twtmp,0x13);
   1266 	&pxor	($twtmp,$twtmp);
   1267 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1268 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1269 	&pand	($twres,$twmask);		# isolate carry and residue
   1270 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1271 	&pxor	($tweak,$twres);
   1272 	&cmp	($len,0x40);
   1273 	&jb	(&label("xts_enc_three"));
   1274 
   1275 	&pshufd	($twres,$twtmp,0x13);
   1276 	&pxor	($twtmp,$twtmp);
   1277 	&movdqa	($inout5,$tweak);		# put aside previous tweak
   1278 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1279 	&pand	($twres,$twmask);		# isolate carry and residue
   1280 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1281 	&pxor	($tweak,$twres);
   1282 	&movdqa	(&QWP(16*0,"esp"),$inout3);
   1283 	&movdqa	(&QWP(16*1,"esp"),$inout4);
   1284 	&je	(&label("xts_enc_four"));
   1285 
        	# five blocks: tweaks 0..3 are on the stack, the fifth is built in
        	# $inout5 and saved at 16*4; the 6-way routine is used and only
        	# $inout0..$inout4 are stored afterwards
   1286 	&movdqa	(&QWP(16*2,"esp"),$inout5);
   1287 	&pshufd	($inout5,$twtmp,0x13);
   1288 	&movdqa	(&QWP(16*3,"esp"),$tweak);
   1289 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1290 	&pand	($inout5,$twmask);		# isolate carry and residue
   1291 	&pxor	($inout5,$tweak);
   1292 
   1293 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
   1294 	&movdqu	($inout1,&QWP(16*1,$inp));
   1295 	&movdqu	($inout2,&QWP(16*2,$inp));
   1296 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1297 	&movdqu	($inout3,&QWP(16*3,$inp));
   1298 	&pxor	($inout1,&QWP(16*1,"esp"));
   1299 	&movdqu	($inout4,&QWP(16*4,$inp));
   1300 	&pxor	($inout2,&QWP(16*2,"esp"));
   1301 	&lea	($inp,&DWP(16*5,$inp));
   1302 	&pxor	($inout3,&QWP(16*3,"esp"));
   1303 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
   1304 	&pxor	($inout4,$inout5);
   1305 
   1306 	&call	("_aesni_encrypt6");
   1307 
   1308 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
   1309 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1310 	&xorps	($inout1,&QWP(16*1,"esp"));
   1311 	&xorps	($inout2,&QWP(16*2,"esp"));
   1312 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1313 	&xorps	($inout3,&QWP(16*3,"esp"));
   1314 	&movups	(&QWP(16*1,$out),$inout1);
   1315 	&xorps	($inout4,$tweak);
   1316 	&movups	(&QWP(16*2,$out),$inout2);
   1317 	&movups	(&QWP(16*3,$out),$inout3);
   1318 	&movups	(&QWP(16*4,$out),$inout4);
   1319 	&lea	($out,&DWP(16*5,$out));
   1320 	&jmp	(&label("xts_enc_done"));
   1321 
   1322 &set_label("xts_enc_one",16);
        	# one block: its tweak is in $inout3
   1323 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1324 	&lea	($inp,&DWP(16*1,$inp));
   1325 	&xorps	($inout0,$inout3);		# input^=tweak
   1326 	if ($inline)
   1327 	{   &aesni_inline_generate1("enc");	}
   1328 	else
   1329 	{   &call	("_aesni_encrypt1");	}
   1330 	&xorps	($inout0,$inout3);		# output^=tweak
   1331 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1332 	&lea	($out,&DWP(16*1,$out));
   1333 
   1334 	&movdqa	($tweak,$inout3);		# last tweak
   1335 	&jmp	(&label("xts_enc_done"));
   1336 
   1337 &set_label("xts_enc_two",16);
        	# two blocks: tweaks in $inout3 and $tweak (copied to $inout4)
   1338 	&movaps	($inout4,$tweak);		# put aside last tweak
   1339 
   1340 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1341 	&movups	($inout1,&QWP(16*1,$inp));
   1342 	&lea	($inp,&DWP(16*2,$inp));
   1343 	&xorps	($inout0,$inout3);		# input^=tweak
   1344 	&xorps	($inout1,$inout4);
   1345 
   1346 	&call	("_aesni_encrypt2");
   1347 
   1348 	&xorps	($inout0,$inout3);		# output^=tweak
   1349 	&xorps	($inout1,$inout4);
   1350 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1351 	&movups	(&QWP(16*1,$out),$inout1);
   1352 	&lea	($out,&DWP(16*2,$out));
   1353 
   1354 	&movdqa	($tweak,$inout4);		# last tweak
   1355 	&jmp	(&label("xts_enc_done"));
   1356 
   1357 &set_label("xts_enc_three",16);
        	# three blocks: tweaks in $inout3, $inout4 and $tweak (copied to $inout5)
   1358 	&movaps	($inout5,$tweak);		# put aside last tweak
   1359 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1360 	&movups	($inout1,&QWP(16*1,$inp));
   1361 	&movups	($inout2,&QWP(16*2,$inp));
   1362 	&lea	($inp,&DWP(16*3,$inp));
   1363 	&xorps	($inout0,$inout3);		# input^=tweak
   1364 	&xorps	($inout1,$inout4);
   1365 	&xorps	($inout2,$inout5);
   1366 
   1367 	&call	("_aesni_encrypt3");
   1368 
   1369 	&xorps	($inout0,$inout3);		# output^=tweak
   1370 	&xorps	($inout1,$inout4);
   1371 	&xorps	($inout2,$inout5);
   1372 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1373 	&movups	(&QWP(16*1,$out),$inout1);
   1374 	&movups	(&QWP(16*2,$out),$inout2);
   1375 	&lea	($out,&DWP(16*3,$out));
   1376 
   1377 	&movdqa	($tweak,$inout5);		# last tweak
   1378 	&jmp	(&label("xts_enc_done"));
   1379 
   1380 &set_label("xts_enc_four",16);
        	# four blocks: tweaks 0 and 1 are at 16*0/16*1(%esp), the third is
        	# in $inout5 and the fourth in $tweak (copied to $inout4)
   1381 	&movaps	($inout4,$tweak);		# put aside last tweak
   1382 
   1383 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1384 	&movups	($inout1,&QWP(16*1,$inp));
   1385 	&movups	($inout2,&QWP(16*2,$inp));
   1386 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1387 	&movups	($inout3,&QWP(16*3,$inp));
   1388 	&lea	($inp,&DWP(16*4,$inp));
   1389 	&xorps	($inout1,&QWP(16*1,"esp"));
   1390 	&xorps	($inout2,$inout5);
   1391 	&xorps	($inout3,$inout4);
   1392 
   1393 	&call	("_aesni_encrypt4");
   1394 
   1395 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1396 	&xorps	($inout1,&QWP(16*1,"esp"));
   1397 	&xorps	($inout2,$inout5);
   1398 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1399 	&xorps	($inout3,$inout4);
   1400 	&movups	(&QWP(16*1,$out),$inout1);
   1401 	&movups	(&QWP(16*2,$out),$inout2);
   1402 	&movups	(&QWP(16*3,$out),$inout3);
   1403 	&lea	($out,&DWP(16*4,$out));
   1404 
   1405 	&movdqa	($tweak,$inout4);		# last tweak
   1406 	&jmp	(&label("xts_enc_done"));
   1407 
   1408 &set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
        	# no leftover whole blocks: $tweak already is the next unused tweak,
        	# so it is handed to the stealing code unchanged
   1409 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1410 	&and	($len,15);
   1411 	&jz	(&label("xts_enc_ret"));
   1412 	&movdqa	($inout3,$tweak);
   1413 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1414 	&jmp	(&label("xts_enc_steal"));
   1415 
   1416 &set_label("xts_enc_done",16);
        	# $tweak is the tweak of the last block written; if a partial block
        	# remains, double it once more into $inout3 for the stealing step
   1417 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1418 	&pxor	($twtmp,$twtmp);
   1419 	&and	($len,15);
   1420 	&jz	(&label("xts_enc_ret"));
   1421 
   1422 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1423 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1424 	&pshufd	($inout3,$twtmp,0x13);
   1425 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1426 	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
   1427 	&pxor	($inout3,$tweak);
   1428 
   1429 &set_label("xts_enc_steal");
        	# Byte loop over the len%16 trailing bytes: each input byte replaces
        	# the corresponding byte of the last full output block (at $out-16),
        	# and the byte it replaces is written at $out as the short final
        	# block. $rounds and $key serve as byte scratch registers here and
        	# are restored from their backups after the loop.
   1430 	&movz	($rounds,&BP(0,$inp));
   1431 	&movz	($key,&BP(-16,$out));
   1432 	&lea	($inp,&DWP(1,$inp));
   1433 	&mov	(&BP(-16,$out),&LB($rounds));
   1434 	&mov	(&BP(0,$out),&LB($key));
   1435 	&lea	($out,&DWP(1,$out));
   1436 	&sub	($len,1);
   1437 	&jnz	(&label("xts_enc_steal"));
   1438 
   1439 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
   1440 	&mov	($key,$key_);			# restore $key
   1441 	&mov	($rounds,$rounds_);		# restore $rounds
   1442 
        	# encrypt the patched block at $out-16 in place with the tweak in $inout3
   1443 	&movups	($inout0,&QWP(-16,$out));	# load input
   1444 	&xorps	($inout0,$inout3);		# input^=tweak
   1445 	if ($inline)
   1446 	{   &aesni_inline_generate1("enc");	}
   1447 	else
   1448 	{   &call	("_aesni_encrypt1");	}
   1449 	&xorps	($inout0,$inout3);		# output^=tweak
   1450 	&movups	(&QWP(-16,$out),$inout0);	# write output
   1451 
   1452 &set_label("xts_enc_ret");
        	# common exit: zero all xmm registers and the six tweak slots
   1453 	&pxor	("xmm0","xmm0");		# clear register bank
   1454 	&pxor	("xmm1","xmm1");
   1455 	&pxor	("xmm2","xmm2");
   1456 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
   1457 	&pxor	("xmm3","xmm3");
   1458 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
   1459 	&pxor	("xmm4","xmm4");
   1460 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
   1461 	&pxor	("xmm5","xmm5");
   1462 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
   1463 	&pxor	("xmm6","xmm6");
   1464 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
   1465 	&pxor	("xmm7","xmm7");
   1466 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
   1467 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
   1468 &function_end("aesni_xts_encrypt");
   1469 
   1470 &function_begin("aesni_xts_decrypt");
   1471 	&mov	($key,&wparam(4));		# key2
   1472 	&mov	($inp,&wparam(5));		# clear-text tweak
   1473 
   1474 	&mov	($rounds,&DWP(240,$key));	# key2->rounds
   1475 	&movups	($inout0,&QWP(0,$inp));
   1476 	if ($inline)
   1477 	{   &aesni_inline_generate1("enc");	}
   1478 	else
   1479 	{   &call	("_aesni_encrypt1");	}
   1480 
   1481 	&mov	($inp,&wparam(0));
   1482 	&mov	($out,&wparam(1));
   1483 	&mov	($len,&wparam(2));
   1484 	&mov	($key,&wparam(3));		# key1
   1485 
   1486 	&mov	($key_,"esp");
   1487 	&sub	("esp",16*7+8);
   1488 	&and	("esp",-16);			# align stack
   1489 
   1490 	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
   1491 	&test	($len,15);
   1492 	&setnz	(&LB($rounds_));
   1493 	&shl	($rounds_,4);
   1494 	&sub	($len,$rounds_);
   1495 
   1496 	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
   1497 	&mov	(&DWP(16*6+4,"esp"),0);
   1498 	&mov	(&DWP(16*6+8,"esp"),1);
   1499 	&mov	(&DWP(16*6+12,"esp"),0);
   1500 	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
   1501 	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
   1502 
   1503 	&mov	($rounds,&DWP(240,$key));	# key1->rounds
   1504 	&mov	($key_,$key);			# backup $key
   1505 	&mov	($rounds_,$rounds);		# backup $rounds
   1506 
   1507 	&movdqa	($tweak,$inout0);
   1508 	&pxor	($twtmp,$twtmp);
   1509 	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
   1510 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1511 
   1512 	&and	($len,-16);
   1513 	&sub	($len,16*6);
   1514 	&jc	(&label("xts_dec_short"));
   1515 
   1516 	&shl	($rounds,4);
   1517 	&mov	($rounds_,16);
   1518 	&sub	($rounds_,$rounds);
   1519 	&lea	($key,&DWP(32,$key,$rounds));
   1520 	&jmp	(&label("xts_dec_loop6"));
   1521 
   1522 &set_label("xts_dec_loop6",16);
   1523 	for ($i=0;$i<4;$i++) {
   1524 	    &pshufd	($twres,$twtmp,0x13);
   1525 	    &pxor	($twtmp,$twtmp);
   1526 	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
   1527 	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
   1528 	    &pand	($twres,$twmask);	# isolate carry and residue
   1529 	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
   1530 	    &pxor	($tweak,$twres);
   1531 	}
   1532 	&pshufd	($inout5,$twtmp,0x13);
   1533 	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
   1534 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1535 	 &$movekey	($rndkey0,&QWP(0,$key_));
   1536 	&pand	($inout5,$twmask);		# isolate carry and residue
   1537 	 &movups	($inout0,&QWP(0,$inp));	# load input
   1538 	&pxor	($inout5,$tweak);
   1539 
   1540 	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
   1541 	&mov	($rounds,$rounds_);
   1542 	&movdqu	($inout1,&QWP(16*1,$inp));
   1543 	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
   1544 	&movdqu	($inout2,&QWP(16*2,$inp));
   1545 	 &pxor		($inout1,$rndkey0);
   1546 	&movdqu	($inout3,&QWP(16*3,$inp));
   1547 	 &pxor		($inout2,$rndkey0);
   1548 	&movdqu	($inout4,&QWP(16*4,$inp));
   1549 	 &pxor		($inout3,$rndkey0);
   1550 	&movdqu	($rndkey1,&QWP(16*5,$inp));
   1551 	 &pxor		($inout4,$rndkey0);
   1552 	&lea	($inp,&DWP(16*6,$inp));
   1553 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1554 	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
   1555 	&pxor	($inout5,$rndkey1);
   1556 
   1557 	 &$movekey	($rndkey1,&QWP(16,$key_));
   1558 	&pxor	($inout1,&QWP(16*1,"esp"));
   1559 	&pxor	($inout2,&QWP(16*2,"esp"));
   1560 	 &aesdec	($inout0,$rndkey1);
   1561 	&pxor	($inout3,&QWP(16*3,"esp"));
   1562 	&pxor	($inout4,&QWP(16*4,"esp"));
   1563 	 &aesdec	($inout1,$rndkey1);
   1564 	&pxor		($inout5,$rndkey0);
   1565 	 &$movekey	($rndkey0,&QWP(32,$key_));
   1566 	 &aesdec	($inout2,$rndkey1);
   1567 	 &aesdec	($inout3,$rndkey1);
   1568 	 &aesdec	($inout4,$rndkey1);
   1569 	 &aesdec	($inout5,$rndkey1);
   1570 	&call		(&label("_aesni_decrypt6_enter"));
   1571 
   1572 	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
   1573        &pxor	($twtmp,$twtmp);
   1574 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1575        &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
   1576 	&xorps	($inout1,&QWP(16*1,"esp"));
   1577 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1578 	&xorps	($inout2,&QWP(16*2,"esp"));
   1579 	&movups	(&QWP(16*1,$out),$inout1);
   1580 	&xorps	($inout3,&QWP(16*3,"esp"));
   1581 	&movups	(&QWP(16*2,$out),$inout2);
   1582 	&xorps	($inout4,&QWP(16*4,"esp"));
   1583 	&movups	(&QWP(16*3,$out),$inout3);
   1584 	&xorps	($inout5,$tweak);
   1585 	&movups	(&QWP(16*4,$out),$inout4);
   1586        &pshufd	($twres,$twtmp,0x13);
   1587 	&movups	(&QWP(16*5,$out),$inout5);
   1588 	&lea	($out,&DWP(16*6,$out));
   1589        &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
   1590 
   1591 	&pxor	($twtmp,$twtmp);
   1592 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1593 	&pand	($twres,$twmask);		# isolate carry and residue
   1594 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1595 	&pxor	($tweak,$twres);
   1596 
   1597 	&sub	($len,16*6);
   1598 	&jnc	(&label("xts_dec_loop6"));
   1599 
   1600 	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
   1601 	&mov	($key,$key_);			# restore $key
   1602 	&mov	($rounds_,$rounds);
   1603 
   1604 &set_label("xts_dec_short");
   1605 	&add	($len,16*6);
   1606 	&jz	(&label("xts_dec_done6x"));
   1607 
   1608 	&movdqa	($inout3,$tweak);		# put aside previous tweak
   1609 	&cmp	($len,0x20);
   1610 	&jb	(&label("xts_dec_one"));
   1611 
   1612 	&pshufd	($twres,$twtmp,0x13);
   1613 	&pxor	($twtmp,$twtmp);
   1614 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1615 	&pand	($twres,$twmask);		# isolate carry and residue
   1616 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1617 	&pxor	($tweak,$twres);
   1618 	&je	(&label("xts_dec_two"));
   1619 
   1620 	&pshufd	($twres,$twtmp,0x13);
   1621 	&pxor	($twtmp,$twtmp);
   1622 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1623 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1624 	&pand	($twres,$twmask);		# isolate carry and residue
   1625 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1626 	&pxor	($tweak,$twres);
   1627 	&cmp	($len,0x40);
   1628 	&jb	(&label("xts_dec_three"));
   1629 
   1630 	&pshufd	($twres,$twtmp,0x13);
   1631 	&pxor	($twtmp,$twtmp);
   1632 	&movdqa	($inout5,$tweak);		# put aside previous tweak
   1633 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1634 	&pand	($twres,$twmask);		# isolate carry and residue
   1635 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1636 	&pxor	($tweak,$twres);
   1637 	&movdqa	(&QWP(16*0,"esp"),$inout3);
   1638 	&movdqa	(&QWP(16*1,"esp"),$inout4);
   1639 	&je	(&label("xts_dec_four"));
   1640 
   1641 	&movdqa	(&QWP(16*2,"esp"),$inout5);
   1642 	&pshufd	($inout5,$twtmp,0x13);
   1643 	&movdqa	(&QWP(16*3,"esp"),$tweak);
   1644 	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
   1645 	&pand	($inout5,$twmask);		# isolate carry and residue
   1646 	&pxor	($inout5,$tweak);
   1647 
   1648 	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
   1649 	&movdqu	($inout1,&QWP(16*1,$inp));
   1650 	&movdqu	($inout2,&QWP(16*2,$inp));
   1651 	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1652 	&movdqu	($inout3,&QWP(16*3,$inp));
   1653 	&pxor	($inout1,&QWP(16*1,"esp"));
   1654 	&movdqu	($inout4,&QWP(16*4,$inp));
   1655 	&pxor	($inout2,&QWP(16*2,"esp"));
   1656 	&lea	($inp,&DWP(16*5,$inp));
   1657 	&pxor	($inout3,&QWP(16*3,"esp"));
   1658 	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
   1659 	&pxor	($inout4,$inout5);
   1660 
   1661 	&call	("_aesni_decrypt6");
   1662 
   1663 	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
   1664 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1665 	&xorps	($inout1,&QWP(16*1,"esp"));
   1666 	&xorps	($inout2,&QWP(16*2,"esp"));
   1667 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1668 	&xorps	($inout3,&QWP(16*3,"esp"));
   1669 	&movups	(&QWP(16*1,$out),$inout1);
   1670 	&xorps	($inout4,$tweak);
   1671 	&movups	(&QWP(16*2,$out),$inout2);
   1672 	&movups	(&QWP(16*3,$out),$inout3);
   1673 	&movups	(&QWP(16*4,$out),$inout4);
   1674 	&lea	($out,&DWP(16*5,$out));
   1675 	&jmp	(&label("xts_dec_done"));
   1676 
   1677 &set_label("xts_dec_one",16);
   1678 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1679 	&lea	($inp,&DWP(16*1,$inp));
   1680 	&xorps	($inout0,$inout3);		# input^=tweak
   1681 	if ($inline)
   1682 	{   &aesni_inline_generate1("dec");	}
   1683 	else
   1684 	{   &call	("_aesni_decrypt1");	}
   1685 	&xorps	($inout0,$inout3);		# output^=tweak
   1686 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1687 	&lea	($out,&DWP(16*1,$out));
   1688 
   1689 	&movdqa	($tweak,$inout3);		# last tweak
   1690 	&jmp	(&label("xts_dec_done"));
   1691 
   1692 &set_label("xts_dec_two",16);
   1693 	&movaps	($inout4,$tweak);		# put aside last tweak
   1694 
   1695 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1696 	&movups	($inout1,&QWP(16*1,$inp));
   1697 	&lea	($inp,&DWP(16*2,$inp));
   1698 	&xorps	($inout0,$inout3);		# input^=tweak
   1699 	&xorps	($inout1,$inout4);
   1700 
   1701 	&call	("_aesni_decrypt2");
   1702 
   1703 	&xorps	($inout0,$inout3);		# output^=tweak
   1704 	&xorps	($inout1,$inout4);
   1705 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1706 	&movups	(&QWP(16*1,$out),$inout1);
   1707 	&lea	($out,&DWP(16*2,$out));
   1708 
   1709 	&movdqa	($tweak,$inout4);		# last tweak
   1710 	&jmp	(&label("xts_dec_done"));
   1711 
   1712 &set_label("xts_dec_three",16);
   1713 	&movaps	($inout5,$tweak);		# put aside last tweak
   1714 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1715 	&movups	($inout1,&QWP(16*1,$inp));
   1716 	&movups	($inout2,&QWP(16*2,$inp));
   1717 	&lea	($inp,&DWP(16*3,$inp));
   1718 	&xorps	($inout0,$inout3);		# input^=tweak
   1719 	&xorps	($inout1,$inout4);
   1720 	&xorps	($inout2,$inout5);
   1721 
   1722 	&call	("_aesni_decrypt3");
   1723 
   1724 	&xorps	($inout0,$inout3);		# output^=tweak
   1725 	&xorps	($inout1,$inout4);
   1726 	&xorps	($inout2,$inout5);
   1727 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1728 	&movups	(&QWP(16*1,$out),$inout1);
   1729 	&movups	(&QWP(16*2,$out),$inout2);
   1730 	&lea	($out,&DWP(16*3,$out));
   1731 
   1732 	&movdqa	($tweak,$inout5);		# last tweak
   1733 	&jmp	(&label("xts_dec_done"));
   1734 
   1735 &set_label("xts_dec_four",16);
   1736 	&movaps	($inout4,$tweak);		# put aside last tweak
   1737 
   1738 	&movups	($inout0,&QWP(16*0,$inp));	# load input
   1739 	&movups	($inout1,&QWP(16*1,$inp));
   1740 	&movups	($inout2,&QWP(16*2,$inp));
   1741 	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
   1742 	&movups	($inout3,&QWP(16*3,$inp));
   1743 	&lea	($inp,&DWP(16*4,$inp));
   1744 	&xorps	($inout1,&QWP(16*1,"esp"));
   1745 	&xorps	($inout2,$inout5);
   1746 	&xorps	($inout3,$inout4);
   1747 
   1748 	&call	("_aesni_decrypt4");
   1749 
   1750 	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
   1751 	&xorps	($inout1,&QWP(16*1,"esp"));
   1752 	&xorps	($inout2,$inout5);
   1753 	&movups	(&QWP(16*0,$out),$inout0);	# write output
   1754 	&xorps	($inout3,$inout4);
   1755 	&movups	(&QWP(16*1,$out),$inout1);
   1756 	&movups	(&QWP(16*2,$out),$inout2);
   1757 	&movups	(&QWP(16*3,$out),$inout3);
   1758 	&lea	($out,&DWP(16*4,$out));
   1759 
   1760 	&movdqa	($tweak,$inout4);		# last tweak
   1761 	&jmp	(&label("xts_dec_done"));
   1762 
   1763 &set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
   1764 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1765 	&and	($len,15);
   1766 	&jz	(&label("xts_dec_ret"));
   1767 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1768 	&jmp	(&label("xts_dec_only_one_more"));
   1769 
   1770 &set_label("xts_dec_done",16);
   1771 	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
   1772 	&pxor	($twtmp,$twtmp);
   1773 	&and	($len,15);
   1774 	&jz	(&label("xts_dec_ret"));
   1775 
   1776 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1777 	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
   1778 	&pshufd	($twres,$twtmp,0x13);
   1779 	&pxor	($twtmp,$twtmp);
   1780 	&movdqa	($twmask,&QWP(16*6,"esp"));
   1781 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1782 	&pand	($twres,$twmask);		# isolate carry and residue
   1783 	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
   1784 	&pxor	($tweak,$twres);
   1785 
   1786 &set_label("xts_dec_only_one_more");
   1787 	&pshufd	($inout3,$twtmp,0x13);
   1788 	&movdqa	($inout4,$tweak);		# put aside previous tweak
   1789 	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
   1790 	&pand	($inout3,$twmask);		# isolate carry and residue
   1791 	&pxor	($inout3,$tweak);
   1792 
   1793 	&mov	($key,$key_);			# restore $key
   1794 	&mov	($rounds,$rounds_);		# restore $rounds
   1795 
   1796 	&movups	($inout0,&QWP(0,$inp));		# load input
   1797 	&xorps	($inout0,$inout3);		# input^=tweak
   1798 	if ($inline)
   1799 	{   &aesni_inline_generate1("dec");	}
   1800 	else
   1801 	{   &call	("_aesni_decrypt1");	}
   1802 	&xorps	($inout0,$inout3);		# output^=tweak
   1803 	&movups	(&QWP(0,$out),$inout0);		# write output
   1804 
   1805 &set_label("xts_dec_steal");
   1806 	&movz	($rounds,&BP(16,$inp));
   1807 	&movz	($key,&BP(0,$out));
   1808 	&lea	($inp,&DWP(1,$inp));
   1809 	&mov	(&BP(0,$out),&LB($rounds));
   1810 	&mov	(&BP(16,$out),&LB($key));
   1811 	&lea	($out,&DWP(1,$out));
   1812 	&sub	($len,1);
   1813 	&jnz	(&label("xts_dec_steal"));
   1814 
   1815 	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
   1816 	&mov	($key,$key_);			# restore $key
   1817 	&mov	($rounds,$rounds_);		# restore $rounds
   1818 
   1819 	&movups	($inout0,&QWP(0,$out));		# load input
   1820 	&xorps	($inout0,$inout4);		# input^=tweak
   1821 	if ($inline)
   1822 	{   &aesni_inline_generate1("dec");	}
   1823 	else
   1824 	{   &call	("_aesni_decrypt1");	}
   1825 	&xorps	($inout0,$inout4);		# output^=tweak
   1826 	&movups	(&QWP(0,$out),$inout0);		# write output
   1827 
   1828 &set_label("xts_dec_ret");
   1829 	&pxor	("xmm0","xmm0");		# clear register bank
   1830 	&pxor	("xmm1","xmm1");
   1831 	&pxor	("xmm2","xmm2");
   1832 	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
   1833 	&pxor	("xmm3","xmm3");
   1834 	&movdqa	(&QWP(16*1,"esp"),"xmm0");
   1835 	&pxor	("xmm4","xmm4");
   1836 	&movdqa	(&QWP(16*2,"esp"),"xmm0");
   1837 	&pxor	("xmm5","xmm5");
   1838 	&movdqa	(&QWP(16*3,"esp"),"xmm0");
   1839 	&pxor	("xmm6","xmm6");
   1840 	&movdqa	(&QWP(16*4,"esp"),"xmm0");
   1841 	&pxor	("xmm7","xmm7");
   1842 	&movdqa	(&QWP(16*5,"esp"),"xmm0");
   1843 	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
   1844 &function_end("aesni_xts_decrypt");
   1845 }
   1846 }
   1847 
   1849 ######################################################################
   1850 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
   1851 #                           size_t length, const AES_KEY *key,
   1852 #                           unsigned char *ivp,const int enc);
        #
        # CBC-mode encryption (enc != 0) or decryption (enc == 0) of |length|
        # bytes from |inp| to |out|.  When length is 0 the routine returns at
        # once and *ivp is left untouched; otherwise the updated chaining
        # value is written back to *ivp on exit.
        #
        # Stack frame (16-byte aligned): 0(%esp) is a 16-byte scratch slot
        # (saved IV in the 6x loop, staged block for a partial tail) and
        # 16(%esp) holds the caller's %esp.
   1853 &function_begin("${PREFIX}_cbc_encrypt");
   1854 	&mov	($inp,&wparam(0));
        	# $rounds_ temporarily holds the future stack pointer: 24 bytes
        	# below the current %esp, rounded down to a 16-byte boundary.
   1855 	&mov	($rounds_,"esp");
   1856 	&mov	($out,&wparam(1));
   1857 	&sub	($rounds_,24);
   1858 	&mov	($len,&wparam(2));
   1859 	&and	($rounds_,-16);
   1860 	&mov	($key,&wparam(3));
   1861 	&mov	($key_,&wparam(4));
   1862 	&test	($len,$len);
   1863 	&jz	(&label("cbc_abort"));
   1864 
        	# The flags produced by this compare are consumed by the je at the
        	# end of this group; everything in between is a plain move.
   1865 	&cmp	(&wparam(5),0);
   1866 	&xchg	($rounds_,"esp");		# alloca
   1867 	&movups	($ivec,&QWP(0,$key_));		# load IV
   1868 	&mov	($rounds,&DWP(240,$key));
   1869 	&mov	($key_,$key);			# backup $key
   1870 	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
   1871 	&mov	($rounds_,$rounds);		# backup $rounds
   1872 	&je	(&label("cbc_decrypt"));
   1873 
        	# ---- encryption: one block per iteration, $inout0 carries the
        	# chaining value (initially the IV).
   1874 	&movaps	($inout0,$ivec);
   1875 	&cmp	($len,16);
   1876 	&jb	(&label("cbc_enc_tail"));
   1877 	&sub	($len,16);
   1878 	&jmp	(&label("cbc_enc_loop"));
   1879 
   1880 &set_label("cbc_enc_loop",16);
   1881 	&movups	($ivec,&QWP(0,$inp));		# input actually
   1882 	&lea	($inp,&DWP(16,$inp));
        	# Non-inline form: xor the input block into the chaining value and
        	# encrypt it.  The inline generator is handed the same two
        	# registers; presumably it folds the xor into the generated code.
   1883 	if ($inline)
   1884 	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
   1885 	else
   1886 	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
   1887 	&mov	($rounds,$rounds_);	# restore $rounds
   1888 	&mov	($key,$key_);		# restore $key
   1889 	&movups	(&QWP(0,$out),$inout0);	# store output
   1890 	&lea	($out,&DWP(16,$out));
   1891 	&sub	($len,16);
   1892 	&jnc	(&label("cbc_enc_loop"));
        	# $len went negative: add 16 back, a non-zero remainder is a
        	# trailing partial block.
   1893 	&add	($len,16);
   1894 	&jnz	(&label("cbc_enc_tail"));
   1895 	&movaps	($ivec,$inout0);
   1896 	&pxor	($inout0,$inout0);
   1897 	&jmp	(&label("cbc_ret"));
   1898 
        # Partial final block: copy the $len remaining bytes to the output
        # buffer, zero-fill up to 16 bytes, then run the loop once more on
        # that padded block in place.  $len is zero afterwards, so the loop
        # exits after this single extra iteration.
   1899 &set_label("cbc_enc_tail");
   1900 	&mov	("ecx",$len);		# zaps $rounds
   1901 	&data_word(0xA4F3F689);		# rep movsb
   1902 	&mov	("ecx",16);		# zero tail
   1903 	&sub	("ecx",$len);
   1904 	&xor	("eax","eax");		# zaps $len
   1905 	&data_word(0xAAF3F689);		# rep stosb
   1906 	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
   1907 	&mov	($rounds,$rounds_);	# restore $rounds
   1908 	&mov	($inp,$out);		# $inp and $out are the same
   1909 	&mov	($key,$key_);		# restore $key
   1910 	&jmp	(&label("cbc_enc_loop"));
   1911 ######################################################################
        # ---- decryption: inputs of more than 0x50 bytes go through the
        # six-block loop, anything up to 0x50 bytes is handled by the tail.
   1912 &set_label("cbc_decrypt",16);
   1913 	&cmp	($len,0x50);
   1914 	&jbe	(&label("cbc_dec_tail"));
   1915 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   1916 	&sub	($len,0x50);
   1917 	&jmp	(&label("cbc_dec_loop6_enter"));
   1918 
        # Each pass stores five output blocks right away; the sixth stays in
        # $inout5 and is written at the top of the next pass (or after the
        # loop), while $rndkey0 carries the last ciphertext block as next IV.
   1919 &set_label("cbc_dec_loop6",16);
   1920 	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
   1921 	&movups	(&QWP(0,$out),$inout5);
   1922 	&lea	($out,&DWP(0x10,$out));
   1923 &set_label("cbc_dec_loop6_enter");
   1924 	&movdqu	($inout0,&QWP(0,$inp));
   1925 	&movdqu	($inout1,&QWP(0x10,$inp));
   1926 	&movdqu	($inout2,&QWP(0x20,$inp));
   1927 	&movdqu	($inout3,&QWP(0x30,$inp));
   1928 	&movdqu	($inout4,&QWP(0x40,$inp));
   1929 	&movdqu	($inout5,&QWP(0x50,$inp));
   1930 
   1931 	&call	("_aesni_decrypt6");
   1932 
        	# CBC chaining: block 0 is xored with the saved IV, every other
        	# block with the preceding ciphertext block, re-read from $inp
        	# through the $rndkey0/$rndkey1 registers.
   1933 	&movups	($rndkey1,&QWP(0,$inp));
   1934 	&movups	($rndkey0,&QWP(0x10,$inp));
   1935 	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
   1936 	&xorps	($inout1,$rndkey1);
   1937 	&movups	($rndkey1,&QWP(0x20,$inp));
   1938 	&xorps	($inout2,$rndkey0);
   1939 	&movups	($rndkey0,&QWP(0x30,$inp));
   1940 	&xorps	($inout3,$rndkey1);
   1941 	&movups	($rndkey1,&QWP(0x40,$inp));
   1942 	&xorps	($inout4,$rndkey0);
   1943 	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
   1944 	&xorps	($inout5,$rndkey1);
   1945 	&movups	(&QWP(0,$out),$inout0);
   1946 	&movups	(&QWP(0x10,$out),$inout1);
   1947 	&lea	($inp,&DWP(0x60,$inp));
   1948 	&movups	(&QWP(0x20,$out),$inout2);
   1949 	&mov	($rounds,$rounds_);		# restore $rounds
   1950 	&movups	(&QWP(0x30,$out),$inout3);
   1951 	&mov	($key,$key_);			# restore $key
   1952 	&movups	(&QWP(0x40,$out),$inout4);
   1953 	&lea	($out,&DWP(0x50,$out));
   1954 	&sub	($len,0x60);
   1955 	&ja	(&label("cbc_dec_loop6"));
   1956 
        	# Loop finished: the pending sixth block moves to $inout0 and the
        	# last ciphertext block becomes the IV.  If nothing is left the
        	# pending block is stored by the "collected" code, otherwise it is
        	# stored here before the tail is processed.
   1957 	&movaps	($inout0,$inout5);
   1958 	&movaps	($ivec,$rndkey0);
   1959 	&add	($len,0x50);
   1960 	&jle	(&label("cbc_dec_clear_tail_collected"));
   1961 	&movups	(&QWP(0,$out),$inout0);
   1962 	&lea	($out,&DWP(0x10,$out));
        # Tail: dispatch on the remaining length to the 1-, 2-, 3- or 4-block
        # code; $in0/$in1 keep copies of the first two ciphertext blocks for
        # the chaining xor and the next IV.
   1963 &set_label("cbc_dec_tail");
   1964 	&movups	($inout0,&QWP(0,$inp));
   1965 	&movaps	($in0,$inout0);
   1966 	&cmp	($len,0x10);
   1967 	&jbe	(&label("cbc_dec_one"));
   1968 
   1969 	&movups	($inout1,&QWP(0x10,$inp));
   1970 	&movaps	($in1,$inout1);
   1971 	&cmp	($len,0x20);
   1972 	&jbe	(&label("cbc_dec_two"));
   1973 
   1974 	&movups	($inout2,&QWP(0x20,$inp));
   1975 	&cmp	($len,0x30);
   1976 	&jbe	(&label("cbc_dec_three"));
   1977 
   1978 	&movups	($inout3,&QWP(0x30,$inp));
   1979 	&cmp	($len,0x40);
   1980 	&jbe	(&label("cbc_dec_four"));
   1981 
        	# More than 0x40 bytes left: load a fifth block and run the
        	# six-block primitive with the sixth lane zeroed.
   1982 	&movups	($inout4,&QWP(0x40,$inp));
   1983 	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
   1984 	&movups	($inout0,&QWP(0,$inp));
   1985 	&xorps	($inout5,$inout5);
   1986 	&call	("_aesni_decrypt6");
   1987 	&movups	($rndkey1,&QWP(0,$inp));
   1988 	&movups	($rndkey0,&QWP(0x10,$inp));
   1989 	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
   1990 	&xorps	($inout1,$rndkey1);
   1991 	&movups	($rndkey1,&QWP(0x20,$inp));
   1992 	&xorps	($inout2,$rndkey0);
   1993 	&movups	($rndkey0,&QWP(0x30,$inp));
   1994 	&xorps	($inout3,$rndkey1);
   1995 	&movups	($ivec,&QWP(0x40,$inp));	# IV
   1996 	&xorps	($inout4,$rndkey0);
   1997 	&movups	(&QWP(0,$out),$inout0);
   1998 	&movups	(&QWP(0x10,$out),$inout1);
   1999 	&pxor	($inout1,$inout1);
   2000 	&movups	(&QWP(0x20,$out),$inout2);
   2001 	&pxor	($inout2,$inout2);
   2002 	&movups	(&QWP(0x30,$out),$inout3);
   2003 	&pxor	($inout3,$inout3);
   2004 	&lea	($out,&DWP(0x40,$out));
   2005 	&movaps	($inout0,$inout4);
   2006 	&pxor	($inout4,$inout4);
   2007 	&sub	($len,0x50);
   2008 	&jmp	(&label("cbc_dec_tail_collected"));
   2009 
        # In every N-block variant below the last decrypted block is left in
        # $inout0 (not yet stored), $ivec receives the last ciphertext block
        # and the other data registers that held plaintext are zeroed.
   2010 &set_label("cbc_dec_one",16);
   2011 	if ($inline)
   2012 	{   &aesni_inline_generate1("dec");	}
   2013 	else
   2014 	{   &call	("_aesni_decrypt1");	}
   2015 	&xorps	($inout0,$ivec);
   2016 	&movaps	($ivec,$in0);
   2017 	&sub	($len,0x10);
   2018 	&jmp	(&label("cbc_dec_tail_collected"));
   2019 
   2020 &set_label("cbc_dec_two",16);
   2021 	&call	("_aesni_decrypt2");
   2022 	&xorps	($inout0,$ivec);
   2023 	&xorps	($inout1,$in0);
   2024 	&movups	(&QWP(0,$out),$inout0);
   2025 	&movaps	($inout0,$inout1);
   2026 	&pxor	($inout1,$inout1);
   2027 	&lea	($out,&DWP(0x10,$out));
   2028 	&movaps	($ivec,$in1);
   2029 	&sub	($len,0x20);
   2030 	&jmp	(&label("cbc_dec_tail_collected"));
   2031 
   2032 &set_label("cbc_dec_three",16);
   2033 	&call	("_aesni_decrypt3");
   2034 	&xorps	($inout0,$ivec);
   2035 	&xorps	($inout1,$in0);
   2036 	&xorps	($inout2,$in1);
   2037 	&movups	(&QWP(0,$out),$inout0);
   2038 	&movaps	($inout0,$inout2);
   2039 	&pxor	($inout2,$inout2);
   2040 	&movups	(&QWP(0x10,$out),$inout1);
   2041 	&pxor	($inout1,$inout1);
   2042 	&lea	($out,&DWP(0x20,$out));
   2043 	&movups	($ivec,&QWP(0x20,$inp));
   2044 	&sub	($len,0x30);
   2045 	&jmp	(&label("cbc_dec_tail_collected"));
   2046 
   2047 &set_label("cbc_dec_four",16);
   2048 	&call	("_aesni_decrypt4");
   2049 	&movups	($rndkey1,&QWP(0x10,$inp));
   2050 	&movups	($rndkey0,&QWP(0x20,$inp));
   2051 	&xorps	($inout0,$ivec);
   2052 	&movups	($ivec,&QWP(0x30,$inp));
   2053 	&xorps	($inout1,$in0);
   2054 	&movups	(&QWP(0,$out),$inout0);
   2055 	&xorps	($inout2,$rndkey1);
   2056 	&movups	(&QWP(0x10,$out),$inout1);
   2057 	&pxor	($inout1,$inout1);
   2058 	&xorps	($inout3,$rndkey0);
   2059 	&movups	(&QWP(0x20,$out),$inout2);
   2060 	&pxor	($inout2,$inout2);
   2061 	&lea	($out,&DWP(0x30,$out));
   2062 	&movaps	($inout0,$inout3);
   2063 	&pxor	($inout3,$inout3);
   2064 	&sub	($len,0x40);
   2065 	&jmp	(&label("cbc_dec_tail_collected"));
   2066 
   2067 &set_label("cbc_dec_clear_tail_collected",16);
   2068 	&pxor	($inout1,$inout1);
   2069 	&pxor	($inout2,$inout2);
   2070 	&pxor	($inout3,$inout3);
   2071 	&pxor	($inout4,$inout4);
        # $inout0 holds the final decrypted block.  If $len is a multiple of
        # 16 it is stored whole; otherwise only part of it is copied out.
   2072 &set_label("cbc_dec_tail_collected");
   2073 	&and	($len,15);
   2074 	&jnz	(&label("cbc_dec_tail_partial"));
   2075 	&movups	(&QWP(0,$out),$inout0);
   2076 	&pxor	($rndkey0,$rndkey0);
   2077 	&jmp	(&label("cbc_ret"));
   2078 
        # Stage the block in the scratch slot, copy ecx = 16-$len bytes of it
        # to $out, then overwrite the slot so no plaintext stays on the stack.
   2079 &set_label("cbc_dec_tail_partial",16);
   2080 	&movaps	(&QWP(0,"esp"),$inout0);
   2081 	&pxor	($rndkey0,$rndkey0);
   2082 	&mov	("ecx",16);
   2083 	&mov	($inp,"esp");
   2084 	&sub	("ecx",$len);
   2085 	&data_word(0xA4F3F689);		# rep movsb
        	# $rndkey0 was zeroed above; storing $inout0 here would merely
        	# rewrite the plaintext that is already in the slot.
   2086 	&movdqa	(&QWP(0,"esp"),$rndkey0);	# wipe staged block
   2087 
        # Common exit: drop the frame, hand the chaining value back through
        # ivp and clear the registers that held data.
   2088 &set_label("cbc_ret");
   2089 	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
   2090 	&mov	($key_,&wparam(4));
   2091 	&pxor	($inout0,$inout0);
   2092 	&pxor	($rndkey1,$rndkey1);
   2093 	&movups	(&QWP(0,$key_),$ivec);	# output IV
   2094 	&pxor	($ivec,$ivec);
   2095 &set_label("cbc_abort");
   2096 &function_end("${PREFIX}_cbc_encrypt");
   2097 
   2099 ######################################################################
   2100 # Mechanical port from aesni-x86_64.pl.
   2101 #
   2102 # _aesni_set_encrypt_key is private interface,
   2103 # input:
   2104 #	"eax"	const unsigned char *userKey
   2105 #	$rounds	int bits
   2106 #	$key	AES_KEY *key
   2107 # output:
   2108 #	"eax"	return code
        #		(0 on success, -1 if userKey or key is NULL,
        #		-2 if bits is not 128, 192 or 256)
   2109 #	$rounds	rounds
        #		(9, 11 or 13 for 128-, 192- or 256-bit keys; the same
        #		value is stored at byte offset 240 of *key)
        #
        # %ebp and %ebx are saved and restored; $key is advanced while the
        # schedule is written; xmm0-xmm5 are used and zeroed on success.
        #
        # Every key size has two implementations: one built on
        # aeskeygenassist and an "_alt" one built on pshufb+aesenclast with
        # constants from key_const.  The _alt code is taken when, of the two
        # capability bits tested below, only bit 28 is set.
   2110 
   2111 &function_begin_B("_aesni_set_encrypt_key");
   2112 	&push	("ebp");
   2113 	&push	("ebx");
   2114 	&test	("eax","eax");
   2115 	&jz	(&label("bad_pointer"));
   2116 	&test	($key,$key);
   2117 	&jz	(&label("bad_pointer"));
   2118 
        	# call/pop yields the run-time address of "pic"; from it %ebx is
        	# made to point at the key_const table.
   2119 	&call	(&label("pic"));
   2120 &set_label("pic");
   2121 	&blindpop("ebx");
   2122 	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
   2123 
   2124 	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
   2125 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
   2126 	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
   2127 	&mov	("ebp",&DWP(4,"ebp"));
   2128 	&lea	($key,&DWP(16,$key));
   2129 	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
   2130 	&cmp	($rounds,256);
   2131 	&je	(&label("14rounds"));
   2132 	&cmp	($rounds,192);
   2133 	&je	(&label("12rounds"));
   2134 	&cmp	($rounds,128);
   2135 	&jne	(&label("bad_keybits"));
   2136 
        # ---- 128-bit key.  key_128_cold/key_128 each derive the next round
        # key into xmm0; key_128 additionally stores the previous one first.
   2137 &set_label("10rounds",16);
   2138 	&cmp		("ebp",1<<28);
   2139 	&je		(&label("10rounds_alt"));
   2140 
   2141 	&mov		($rounds,9);
   2142 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2143 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
   2144 	&call		(&label("key_128_cold"));
   2145 	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
   2146 	&call		(&label("key_128"));
   2147 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
   2148 	&call		(&label("key_128"));
   2149 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
   2150 	&call		(&label("key_128"));
   2151 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
   2152 	&call		(&label("key_128"));
   2153 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
   2154 	&call		(&label("key_128"));
   2155 	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
   2156 	&call		(&label("key_128"));
   2157 	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
   2158 	&call		(&label("key_128"));
   2159 	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
   2160 	&call		(&label("key_128"));
   2161 	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
   2162 	&call		(&label("key_128"));
   2163 	&$movekey	(&QWP(0,$key),"xmm0");
        	# $key has advanced 160 bytes from the schedule start by now, so
        	# this lands at offset 240.
   2164 	&mov		(&DWP(80,$key),$rounds);
   2165 
   2166 	&jmp	(&label("good_key"));
   2167 
   2168 &set_label("key_128",16);
   2169 	&$movekey	(&QWP(0,$key),"xmm0");
   2170 	&lea		($key,&DWP(16,$key));
   2171 &set_label("key_128_cold");
   2172 	&shufps		("xmm4","xmm0",0b00010000);
   2173 	&xorps		("xmm0","xmm4");
   2174 	&shufps		("xmm4","xmm0",0b10001100);
   2175 	&xorps		("xmm0","xmm4");
   2176 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2177 	&xorps		("xmm0","xmm1");
   2178 	&ret();
   2179 
        # 128-bit key, pshufb/aesenclast variant.  xmm5 is the byte-shuffle
        # mask from key_const+0x00, xmm4 starts as the vector at
        # key_const+0x20 and is shifted left by one bit per iteration.  The
        # loop writes eight round keys; the last two are unrolled below with
        # xmm4 reloaded from key_const+0x30.
   2180 &set_label("10rounds_alt",16);
   2181 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2182 	&mov		($rounds,8);
   2183 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2184 	&movdqa		("xmm2","xmm0");
   2185 	&movdqu		(&QWP(-16,$key),"xmm0");
   2186 
   2187 &set_label("loop_key128");
   2188 	&pshufb		("xmm0","xmm5");
   2189 	&aesenclast	("xmm0","xmm4");
   2190 	&pslld		("xmm4",1);
   2191 	&lea		($key,&DWP(16,$key));
   2192 
   2193 	&movdqa		("xmm3","xmm2");
   2194 	&pslldq		("xmm2",4);
   2195 	&pxor		("xmm3","xmm2");
   2196 	&pslldq		("xmm2",4);
   2197 	&pxor		("xmm3","xmm2");
   2198 	&pslldq		("xmm2",4);
   2199 	&pxor		("xmm2","xmm3");
   2200 
   2201 	&pxor		("xmm0","xmm2");
   2202 	&movdqu		(&QWP(-16,$key),"xmm0");
   2203 	&movdqa		("xmm2","xmm0");
   2204 
   2205 	&dec		($rounds);
   2206 	&jnz		(&label("loop_key128"));
   2207 
   2208 	&movdqa		("xmm4",&QWP(0x30,"ebx"));
   2209 
   2210 	&pshufb		("xmm0","xmm5");
   2211 	&aesenclast	("xmm0","xmm4");
   2212 	&pslld		("xmm4",1);
   2213 
   2214 	&movdqa		("xmm3","xmm2");
   2215 	&pslldq		("xmm2",4);
   2216 	&pxor		("xmm3","xmm2");
   2217 	&pslldq		("xmm2",4);
   2218 	&pxor		("xmm3","xmm2");
   2219 	&pslldq		("xmm2",4);
   2220 	&pxor		("xmm2","xmm3");
   2221 
   2222 	&pxor		("xmm0","xmm2");
   2223 	&movdqu		(&QWP(0,$key),"xmm0");
   2224 
   2225 	&movdqa		("xmm2","xmm0");
   2226 	&pshufb		("xmm0","xmm5");
   2227 	&aesenclast	("xmm0","xmm4");
   2228 
   2229 	&movdqa		("xmm3","xmm2");
   2230 	&pslldq		("xmm2",4);
   2231 	&pxor		("xmm3","xmm2");
   2232 	&pslldq		("xmm2",4);
   2233 	&pxor		("xmm3","xmm2");
   2234 	&pslldq		("xmm2",4);
   2235 	&pxor		("xmm2","xmm3");
   2236 
   2237 	&pxor		("xmm0","xmm2");
   2238 	&movdqu		(&QWP(16,$key),"xmm0");
   2239 
   2240 	&mov		($rounds,9);
        	# $key is 144 bytes past the schedule start here: offset 240.
   2241 	&mov		(&DWP(96,$key),$rounds);
   2242 
   2243 	&jmp	(&label("good_key"));
   2244 
        # ---- 192-bit key.  xmm0 holds the first 128 key bits, xmm2 the
        # remaining 64.  The key_192a/key_192b helpers alternate because the
        # 192-bit expansion step does not line up with 128-bit round keys.
   2245 &set_label("12rounds",16);
   2246 	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
   2247 	&cmp		("ebp",1<<28);
   2248 	&je		(&label("12rounds_alt"));
   2249 
   2250 	&mov		($rounds,11);
   2251 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
   2252 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
   2253 	&call		(&label("key_192a_cold"));
   2254 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
   2255 	&call		(&label("key_192b"));
   2256 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
   2257 	&call		(&label("key_192a"));
   2258 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
   2259 	&call		(&label("key_192b"));
   2260 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
   2261 	&call		(&label("key_192a"));
   2262 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
   2263 	&call		(&label("key_192b"));
   2264 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
   2265 	&call		(&label("key_192a"));
   2266 	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
   2267 	&call		(&label("key_192b"));
   2268 	&$movekey	(&QWP(0,$key),"xmm0");
        	# $key is 192 bytes past the schedule start here: offset 240.
   2269 	&mov		(&DWP(48,$key),$rounds);
   2270 
   2271 	&jmp	(&label("good_key"));
   2272 
   2273 &set_label("key_192a",16);
   2274 	&$movekey	(&QWP(0,$key),"xmm0");
   2275 	&lea		($key,&DWP(16,$key));
   2276 &set_label("key_192a_cold",16);
   2277 	&movaps		("xmm5","xmm2");
   2278 &set_label("key_192b_warm");
   2279 	&shufps		("xmm4","xmm0",0b00010000);
   2280 	&movdqa		("xmm3","xmm2");
   2281 	&xorps		("xmm0","xmm4");
   2282 	&shufps		("xmm4","xmm0",0b10001100);
   2283 	&pslldq		("xmm3",4);
   2284 	&xorps		("xmm0","xmm4");
   2285 	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
   2286 	&pxor		("xmm2","xmm3");
   2287 	&pxor		("xmm0","xmm1");
   2288 	&pshufd		("xmm3","xmm0",0b11111111);
   2289 	&pxor		("xmm2","xmm3");
   2290 	&ret();
   2291 
        # key_192b stores two 16-byte round keys assembled from xmm5, xmm0
        # and xmm2, then continues into the shared expansion step.
   2292 &set_label("key_192b",16);
   2293 	&movaps		("xmm3","xmm0");
   2294 	&shufps		("xmm5","xmm0",0b01000100);
   2295 	&$movekey	(&QWP(0,$key),"xmm5");
   2296 	&shufps		("xmm3","xmm2",0b01001110);
   2297 	&$movekey	(&QWP(16,$key),"xmm3");
   2298 	&lea		($key,&DWP(32,$key));
   2299 	&jmp		(&label("key_192b_warm"));
   2300 
        # 192-bit key, pshufb/aesenclast variant: eight iterations, each
        # writing 24 bytes of schedule (8 via movq, 16 via movdqu); the
        # shuffle mask comes from key_const+0x10.
   2301 &set_label("12rounds_alt",16);
   2302 	&movdqa		("xmm5",&QWP(0x10,"ebx"));
   2303 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2304 	&mov		($rounds,8);
   2305 	&movdqu		(&QWP(-16,$key),"xmm0");
   2306 
   2307 &set_label("loop_key192");
   2308 	&movq		(&QWP(0,$key),"xmm2");
   2309 	&movdqa		("xmm1","xmm2");
   2310 	&pshufb		("xmm2","xmm5");
   2311 	&aesenclast	("xmm2","xmm4");
   2312 	&pslld		("xmm4",1);
   2313 	&lea		($key,&DWP(24,$key));
   2314 
   2315 	&movdqa		("xmm3","xmm0");
   2316 	&pslldq		("xmm0",4);
   2317 	&pxor		("xmm3","xmm0");
   2318 	&pslldq		("xmm0",4);
   2319 	&pxor		("xmm3","xmm0");
   2320 	&pslldq		("xmm0",4);
   2321 	&pxor		("xmm0","xmm3");
   2322 
   2323 	&pshufd		("xmm3","xmm0",0xff);
   2324 	&pxor		("xmm3","xmm1");
   2325 	&pslldq		("xmm1",4);
   2326 	&pxor		("xmm3","xmm1");
   2327 
   2328 	&pxor		("xmm0","xmm2");
   2329 	&pxor		("xmm2","xmm3");
   2330 	&movdqu		(&QWP(-16,$key),"xmm0");
   2331 
   2332 	&dec		($rounds);
   2333 	&jnz		(&label("loop_key192"));
   2334 
   2335 	&mov	($rounds,11);
        	# $key is 208 bytes past the schedule start here: offset 240.
   2336 	&mov	(&DWP(32,$key),$rounds);
   2337 
   2338 	&jmp	(&label("good_key"));
   2339 
        # ---- 256-bit key.  xmm0/xmm2 hold the two key halves; key_256a
        # derives the next even round key into xmm0 and key_256b the next
        # odd one into xmm2, each storing the other half first.
   2340 &set_label("14rounds",16);
   2341 	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
   2342 	&lea		($key,&DWP(16,$key));
   2343 	&cmp		("ebp",1<<28);
   2344 	&je		(&label("14rounds_alt"));
   2345 
   2346 	&mov		($rounds,13);
   2347 	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
   2348 	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
   2349 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
   2350 	&call		(&label("key_256a_cold"));
   2351 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
   2352 	&call		(&label("key_256b"));
   2353 	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
   2354 	&call		(&label("key_256a"));
   2355 	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
   2356 	&call		(&label("key_256b"));
   2357 	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
   2358 	&call		(&label("key_256a"));
   2359 	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
   2360 	&call		(&label("key_256b"));
   2361 	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
   2362 	&call		(&label("key_256a"));
   2363 	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
   2364 	&call		(&label("key_256b"));
   2365 	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
   2366 	&call		(&label("key_256a"));
   2367 	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
   2368 	&call		(&label("key_256b"));
   2369 	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
   2370 	&call		(&label("key_256a"));
   2371 	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
   2372 	&call		(&label("key_256b"));
   2373 	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
   2374 	&call		(&label("key_256a"));
   2375 	&$movekey	(&QWP(0,$key),"xmm0");
        	# $key is 224 bytes past the schedule start here: offset 240.
   2376 	&mov		(&DWP(16,$key),$rounds);
   2377 	&xor		("eax","eax");
   2378 
   2379 	&jmp	(&label("good_key"));
   2380 
   2381 &set_label("key_256a",16);
   2382 	&$movekey	(&QWP(0,$key),"xmm2");
   2383 	&lea		($key,&DWP(16,$key));
   2384 &set_label("key_256a_cold");
   2385 	&shufps		("xmm4","xmm0",0b00010000);
   2386 	&xorps		("xmm0","xmm4");
   2387 	&shufps		("xmm4","xmm0",0b10001100);
   2388 	&xorps		("xmm0","xmm4");
   2389 	&shufps		("xmm1","xmm1",0b11111111);	# critical path
   2390 	&xorps		("xmm0","xmm1");
   2391 	&ret();
   2392 
   2393 &set_label("key_256b",16);
   2394 	&$movekey	(&QWP(0,$key),"xmm0");
   2395 	&lea		($key,&DWP(16,$key));
   2396 
   2397 	&shufps		("xmm4","xmm2",0b00010000);
   2398 	&xorps		("xmm2","xmm4");
   2399 	&shufps		("xmm4","xmm2",0b10001100);
   2400 	&xorps		("xmm2","xmm4");
   2401 	&shufps		("xmm1","xmm1",0b10101010);	# critical path
   2402 	&xorps		("xmm2","xmm1");
   2403 	&ret();
   2404 
        # 256-bit key, pshufb/aesenclast variant.  Each iteration writes one
        # round key derived from xmm0 and, except on the seventh and last
        # iteration, a second one derived from xmm1.
   2405 &set_label("14rounds_alt",16);
   2406 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
   2407 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
   2408 	&mov		($rounds,7);
   2409 	&movdqu		(&QWP(-32,$key),"xmm0");
   2410 	&movdqa		("xmm1","xmm2");
   2411 	&movdqu		(&QWP(-16,$key),"xmm2");
   2412 
   2413 &set_label("loop_key256");
   2414 	&pshufb		("xmm2","xmm5");
   2415 	&aesenclast	("xmm2","xmm4");
   2416 
   2417 	&movdqa		("xmm3","xmm0");
   2418 	&pslldq		("xmm0",4);
   2419 	&pxor		("xmm3","xmm0");
   2420 	&pslldq		("xmm0",4);
   2421 	&pxor		("xmm3","xmm0");
   2422 	&pslldq		("xmm0",4);
   2423 	&pxor		("xmm0","xmm3");
   2424 	&pslld		("xmm4",1);
   2425 
   2426 	&pxor		("xmm0","xmm2");
   2427 	&movdqu		(&QWP(0,$key),"xmm0");
   2428 
   2429 	&dec		($rounds);
   2430 	&jz		(&label("done_key256"));
   2431 
   2432 	&pshufd		("xmm2","xmm0",0xff);
   2433 	&pxor		("xmm3","xmm3");
   2434 	&aesenclast	("xmm2","xmm3");
   2435 
        	# Each instruction is its own statement; without the terminator
        	# Perl reads this and the next line as one "call & call"
        	# bitwise-and expression.
   2436 	&movdqa		("xmm3","xmm1");
   2437 	&pslldq		("xmm1",4);
   2438 	&pxor		("xmm3","xmm1");
   2439 	&pslldq		("xmm1",4);
   2440 	&pxor		("xmm3","xmm1");
   2441 	&pslldq		("xmm1",4);
   2442 	&pxor		("xmm1","xmm3");
   2443 
   2444 	&pxor		("xmm2","xmm1");
   2445 	&movdqu		(&QWP(16,$key),"xmm2");
   2446 	&lea		($key,&DWP(32,$key));
   2447 	&movdqa		("xmm1","xmm2");
   2448 	&jmp		(&label("loop_key256"));
   2449 
   2450 &set_label("done_key256");
   2451 	&mov		($rounds,13);
        	# $key is 224 bytes past the schedule start here: offset 240.
   2452 	&mov		(&DWP(16,$key),$rounds);
   2453 
        # Success exit shared by all paths: clear the registers that held
        # key material and return 0.
   2454 &set_label("good_key");
   2455 	&pxor	("xmm0","xmm0");
   2456 	&pxor	("xmm1","xmm1");
   2457 	&pxor	("xmm2","xmm2");
   2458 	&pxor	("xmm3","xmm3");
   2459 	&pxor	("xmm4","xmm4");
   2460 	&pxor	("xmm5","xmm5");
   2461 	&xor	("eax","eax");
   2462 	&pop	("ebx");
   2463 	&pop	("ebp");
   2464 	&ret	();
   2465 
        # Error exits.  bad_keybits is reached after xmm0 was loaded from
        # *userKey, hence the extra pxor.
   2466 &set_label("bad_pointer",4);
   2467 	&mov	("eax",-1);
   2468 	&pop	("ebx");
   2469 	&pop	("ebp");
   2470 	&ret	();
   2471 &set_label("bad_keybits",4);
   2472 	&pxor	("xmm0","xmm0");
   2473 	&mov	("eax",-2);
   2474 	&pop	("ebx");
   2475 	&pop	("ebp");
   2476 	&ret	();
   2477 &function_end_B("_aesni_set_encrypt_key");
   2478 
   2479 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
   2480 #                              AES_KEY *key)
        #
        # Public wrapper: moves the three stack arguments into the registers
        # the private _aesni_set_encrypt_key routine expects ("eax", $rounds,
        # $key) and returns whatever that routine leaves in "eax".
   2481 &function_begin_B("${PREFIX}_set_encrypt_key");
   2482 	&mov	("eax",&wparam(0));
   2483 	&mov	($rounds,&wparam(1));
   2484 	&mov	($key,&wparam(2));
   2485 	&call	("_aesni_set_encrypt_key");
   2486 	&ret	();
   2487 &function_end_B("${PREFIX}_set_encrypt_key");
   2488 
   2489 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
   2490 #                              AES_KEY *key)
        #
        # Builds the encryption schedule with _aesni_set_encrypt_key and then
        # converts it in place: the round keys are reversed in order and
        # every key except the first and the last is passed through aesimc.
        # A non-zero code from the key setup is returned unchanged and the
        # schedule is not touched further; on success the result is 0.
   2491 &function_begin_B("${PREFIX}_set_decrypt_key");
   2492 	&mov	("eax",&wparam(0));
   2493 	&mov	($rounds,&wparam(1));
   2494 	&mov	($key,&wparam(2));
   2495 	&call	("_aesni_set_encrypt_key");
        	# The helper advanced $key, so reload it; $rounds becomes a byte
        	# offset (16 bytes per round key).
   2496 	&mov	($key,&wparam(2));
   2497 	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
   2498 	&test	("eax","eax");
   2499 	&jnz	(&label("dec_key_ret"));
   2500 	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
   2501 
        	# $key walks up from the first round key, "eax" walks down from
        	# the last one.  The outermost pair is exchanged as is.
   2502 	&$movekey	("xmm0",&QWP(0,$key));	# just swap
   2503 	&$movekey	("xmm1",&QWP(0,"eax"));
   2504 	&$movekey	(&QWP(0,"eax"),"xmm0");
   2505 	&$movekey	(&QWP(0,$key),"xmm1");
   2506 	&lea		($key,&DWP(16,$key));
   2507 	&lea		("eax",&DWP(-16,"eax"));
   2508 
        # Inner pairs: transform both keys with aesimc and store each one in
        # the other's slot, until the two pointers meet.
   2509 &set_label("dec_key_inverse");
   2510 	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
   2511 	&$movekey	("xmm1",&QWP(0,"eax"));
   2512 	&aesimc		("xmm0","xmm0");
   2513 	&aesimc		("xmm1","xmm1");
   2514 	&lea		($key,&DWP(16,$key));
   2515 	&lea		("eax",&DWP(-16,"eax"));
   2516 	&$movekey	(&QWP(16,"eax"),"xmm0");
   2517 	&$movekey	(&QWP(-16,$key),"xmm1");
   2518 	&cmp		("eax",$key);
   2519 	&ja		(&label("dec_key_inverse"));
   2520 
        	# The key in the middle has no partner: transform it in place.
   2521 	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
   2522 	&aesimc		("xmm0","xmm0");
   2523 	&$movekey	(&QWP(0,$key),"xmm0");
   2524 
        	# Clear the registers that held round keys.
   2525 	&pxor		("xmm0","xmm0");
   2526 	&pxor		("xmm1","xmm1");
   2527 	&xor		("eax","eax");		# return success
   2528 &set_label("dec_key_ret");
   2529 	&ret	();
   2530 &function_end_B("${PREFIX}_set_decrypt_key");
   2531 
        # Constant table for the pshufb/aesenclast ("_alt") key-setup paths,
        # addressed relative to %ebx in _aesni_set_encrypt_key:
        #   +0x00  pshufb mask loaded by the 128- and 256-bit _alt paths
        #   +0x10  pshufb mask loaded by the 192-bit _alt path
        #   +0x20  vector of 1s, loaded into xmm4 as the aesenclast operand
        #          and shifted left by one bit per iteration
        #   +0x30  vector of 0x1b, reloaded into xmm4 for the last two
        #          round keys of the 128-bit _alt path
        # The identification string follows the table.
   2532 &set_label("key_const",64);
   2533 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
   2534 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);
   2535 &data_word(1,1,1,1);
   2536 &data_word(0x1b,0x1b,0x1b,0x1b);
   2537 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
   2538 
   2539 &asm_finish();
   2540 
   2541 close STDOUT;
   2542