Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 ######################################################################
      4 ## Constant-time SSSE3 AES core implementation.
      5 ## version 0.1
      6 ##
      7 ## By Mike Hamburg (Stanford University), 2009
      8 ## Public domain.
      9 ##
     10 ## For details see http://shiftleft.org/papers/vector_aes/ and
     11 ## http://crypto.stanford.edu/vpaes/.
     12 
     13 ######################################################################
     14 # September 2011.
     15 #
     16 # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
     17 # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
     18 # doesn't handle partial vectors (doesn't have to if called from
     19 # EVP only). "Drop-in" implies that this module doesn't share key
     20 # schedule structure with the original nor does it make assumption
     21 # about its alignment...
     22 #
     23 # Performance summary. aes-586.pl column lists large-block CBC
     24 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
     25 # byte processed with 128-bit key, and vpaes-x86.pl column - [also
     26 # large-block CBC] encrypt/decrypt.
     27 #
     28 #		aes-586.pl		vpaes-x86.pl
     29 #
     30 # Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
     31 # Nehalem	27.9/40.4/18.1		10.2/11.9
     32 # Atom		70.7/92.1/60.1		61.1/75.4(***)
     33 # Silvermont	45.4/62.9/24.1		49.2/61.1(***)
     34 #
     35 # (*)	"Hyper-threading" in the context refers rather to cache shared
     36 #	among multiple cores, than to specifically Intel HTT. As vast
     37 #	majority of contemporary cores share cache, slower code path
     38 #	is common place. In other words "with-hyper-threading-off"
     39 #	results are presented mostly for reference purposes.
     40 #
     41 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
     42 #
     43 # (***)	Less impressive improvement on Core 2 and Atom is due to slow
     44 #	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
     45 #	and +15% on Atom (as implied, over "hyper-threading-safe"
     46 #	code path).
     47 #
     48 #						<appro@openssl.org>
     49 
     50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     51 push(@INC,"${dir}","${dir}../../../perlasm");
     52 require "x86asm.pl";
     53 
     54 $output = pop; defined($output) or die "no output file given\n";	# last command-line argument names the output file
     55 open OUT,">",$output or die "can't open $output: $!";	# three-arg open: the name is never parsed for mode characters; fail loudly instead of generating into a dead handle
     56 *STDOUT=*OUT;				# from here on everything printed to STDOUT lands in $output
     57 
     58 &asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");	# first argument is passed through to x86asm.pl; a trailing "386" argument sets $x86only
     59 
     60 $PREFIX="vpaes";			# prefix of every exported symbol defined below
     61 
     62 my  ($round, $base, $magic, $key, $const, $inp, $out)=
     63     ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");	# symbolic names for the integer registers used throughout
     64 
     65 &static_label("_vpaes_consts");
     66 &static_label("_vpaes_schedule_low_round");
     67 
     68 &set_label("_vpaes_consts",64);	# start of the constant pool; each data_word line below is one 16-byte row
     69 $k_inv=-0x30;		# inv, inva ($k_* values are byte offsets from _vpaes_consts+0x30, the address the code keeps in $const)
     70 	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
     71 	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
     72 
     73 $k_s0F=-0x10;		# s0F
     74 	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
     75 
     76 $k_ipt=0x00;		# input transform (lo, hi)
     77 	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
     78 	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
     79 
     80 $k_sb1=0x20;		# sb1u, sb1t
     81 	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
     82 	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
     83 $k_sb2=0x40;		# sb2u, sb2t
     84 	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
     85 	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
     86 $k_sbo=0x60;		# sbou, sbot
     87 	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
     88 	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
     89 
     90 $k_mc_forward=0x80;	# mc_forward (four 16-byte rows, selected with $magic)
     91 	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
     92 	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
     93 	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
     94 	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
     95 
     96 $k_mc_backward=0xc0;	# mc_backward (four rows; sits 0x40 above mc_forward and 0x40 below sr, which the encrypt core relies on)
     97 	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
     98 	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
     99 	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    100 	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
    101 
    102 $k_sr=0x100;		# sr (four rows, selected with $magic)
    103 	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    104 	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    105 	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    106 	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
    107 
    108 $k_rcon=0x140;		# rcon
    109 	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
    110 
    111 $k_s63=0x150;		# s63: all equal to 0x63 transformed
    112 	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
    113 
    114 $k_opt=0x160;		# output transform
    115 	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    116 	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
    117 
    118 $k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
    119 	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    120 	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
    121 ##
    122 ##  Decryption stuff
    123 ##  Key schedule constants
    124 ##
    125 $k_dksd=0x1a0;		# decryption key schedule: invskew x*D
    126 	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    127 	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
    128 $k_dksb=0x1c0;		# decryption key schedule: invskew x*B
    129 	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    130 	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
    131 $k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
    132 	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    133 	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
    134 $k_dks9=0x200;		# decryption key schedule: invskew x*9
    135 	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    136 	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
    137 
    138 ##
    139 ##  Decryption stuff
    140 ##  Round function constants
    141 ##
    142 $k_dipt=0x220;		# decryption input transform
    143 	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    144 	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
    145 
    146 $k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
    147 	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    148 	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
    149 $k_dsbd=0x260;		# decryption sbox output *D*u, *D*t (the decrypt core addresses its neighbours relative to this table)
    150 	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    151 	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
    152 $k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
    153 	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    154 	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
    155 $k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
    156 	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    157 	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
    158 $k_dsbo=0x2c0;		# decryption sbox final output
    159 	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    160 	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
    161 &asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
    162 &align	(64);
    163 
    164 &function_begin_B("_vpaes_preheat");	# make $const an absolute table pointer and load the two constants kept in xmm6/xmm7
    165 	&add	($const,&DWP(0,"esp"));		# callers preload $const with _vpaes_consts+0x30-pic_point and place pic_point right after the call, so adding the return address yields the absolute address
    166 	&movdqa	("xmm7",&QWP($k_inv,$const));	# 7 : first row of the inv table
    167 	&movdqa	("xmm6",&QWP($k_s0F,$const));	# 6 : s0F nibble mask
    168 	&ret	();
    169 &function_end_B("_vpaes_preheat");
    170 
    171 ##
    172 ##  _vpaes_encrypt_core
    173 ##
    174 ##  AES-encrypt %xmm0.
    175 ##
    176 ##  Inputs:
    177 ##     %xmm0 = input
    178 ##     %xmm6-%xmm7 as in _vpaes_preheat
    179 ##    (%edx) = scheduled keys
    180 ##
    181 ##  Output in %xmm0
    182 ##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
    183 ##
    184 ##
    185 &function_begin_B("_vpaes_encrypt_core");	# encrypt the block in xmm0 in place; expects xmm6/xmm7 and $const from _vpaes_preheat and the schedule at ($key)
    186 	&mov	($magic,16);			# row offset into mc_forward/mc_backward; advanced by 16 mod 0x40 every round
    187 	&mov	($round,&DWP(240,$key));	# round count, stored at offset 240 by the set_*_key functions
    188 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask (statement terminated explicitly so it is not AND-ed with the next call)
    189 	&movdqa	("xmm2",&QWP($k_ipt,$const));	# 2 : ipt lo
    190 	&pandn	("xmm1","xmm0");		# 1 = high nibbles of the input, still in place
    191 	&pand	("xmm0","xmm6");		# 0 = low nibbles of the input
    192 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (schedule may be unaligned)
    193 	&pshufb	("xmm2","xmm0");		# 2 = ipt lo lookup
    194 	&movdqa	("xmm0",&QWP($k_ipt+16,$const));# 0 : ipt hi
    195 	&pxor	("xmm2","xmm5");		# 2 = ipt lo ^ round key 0
    196 	&psrld	("xmm1",4);			# 1 = high nibbles moved down
    197 	&add	($key,16);			# next round key; its non-zero result is also what the first jnz at enc_entry sees
    198 	&pshufb	("xmm0","xmm1");		# 0 = ipt hi lookup
    199 	&lea	($base,&DWP($k_mc_backward,$const));	# $base = mc_backward, so -0x40 reaches mc_forward and +0x40 reaches sr
    200 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    201 	&jmp	(&label("enc_entry"));
    202 
    203 
    204 &set_label("enc_loop",16);
    205 	# middle of middle round
    206 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    207 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    208 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    209 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    210 	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
    211 	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
    212 	&pxor	("xmm0","xmm4");		# 0 = A
    213 	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
    214 	&pshufb	("xmm5","xmm2");		# 5 = sb2u
    215 	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
    216 	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
    217 	&pshufb	("xmm2","xmm3");		# 2 = sb2t
    218 	&movdqa	("xmm3","xmm0");		# 3 = A
    219 	&pxor	("xmm2","xmm5");		# 2 = 2A
    220 	&pshufb	("xmm0","xmm1");		# 0 = B
    221 	&add	($key,16);			# next key
    222 	&pxor	("xmm0","xmm2");		# 0 = 2A+B
    223 	&pshufb	("xmm3","xmm4");		# 3 = D
    224 	&add	($magic,16);			# next mc
    225 	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
    226 	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
    227 	&and	($magic,0x30);			# ... mod 4
    228 	&sub	($round,1);			# nr--
    229 	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
    230 
    231 &set_label("enc_entry");
    232 	# top of round
    233 	&movdqa	("xmm1","xmm6");		# 1 : i
    234 	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
    235 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    236 	&psrld	("xmm1",4);			# 1 = i
    237 	&pand	("xmm0","xmm6");		# 0 = k
    238 	&pshufb	("xmm5","xmm0");		# 5 = a/k
    239 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    240 	&pxor	("xmm0","xmm1");		# 0 = j
    241 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    242 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    243 	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
    244 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    245 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    246 	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
    247 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    248 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    249 	&pxor	("xmm2","xmm0");		# 2 = io
    250 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    251 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key for the step that follows
    252 	&pxor	("xmm3","xmm1");		# 3 = jo
    253 	&jnz	(&label("enc_loop"));		# tests the last integer op ("sub $round,1", or "add $key,16" on first entry); the SSE ops in between leave EFLAGS alone
    254 
    255 	# middle of last round
    256 	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
    257 	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
    258 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    259 	&pxor	("xmm4","xmm5");		# 4 = sbou + k
    260 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    261 	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
    262 	&pxor	("xmm0","xmm4");		# 0 = A
    263 	&pshufb	("xmm0","xmm1");		# 0 = result after the sr permutation
    264 	&ret	();
    265 &function_end_B("_vpaes_encrypt_core");
    266 
    267 ##
    268 ##  Decryption core
    269 ##
    270 ##  Same API as encryption core.
    271 ##
    272 &function_begin_B("_vpaes_decrypt_core");	# decrypt the block in xmm0 in place; same register contract as _vpaes_encrypt_core
    273 	&lea	($base,&DWP($k_dsbd,$const));	# $base = dsbd; the other decryption tables are addressed relative to it
    274 	&mov	($round,&DWP(240,$key));	# round count, stored at offset 240 by the set_*_key functions
    275 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask
    276 	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));	# 2 : dipt lo
    277 	&pandn	("xmm1","xmm0");		# 1 = high nibbles of the input, still in place
    278 	&mov	($magic,$round);
    279 	&psrld	("xmm1",4);			# 1 = high nibbles moved down (statement terminated explicitly so it is not AND-ed with the next call)
    280 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (schedule may be unaligned)
    281 	&shl	($magic,4);
    282 	&pand	("xmm0","xmm6");		# 0 = low nibbles of the input
    283 	&pshufb	("xmm2","xmm0");		# 2 = dipt lo lookup
    284 	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));	# 0 : dipt hi
    285 	&xor	($magic,0x30);
    286 	&pshufb	("xmm0","xmm1");		# 0 = dipt hi lookup
    287 	&and	($magic,0x30);			# $magic = ((rounds<<4)^0x30)&0x30, the sr row offset
    288 	&pxor	("xmm2","xmm5");		# 2 = dipt lo ^ round key 0
    289 	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));	# 5 : last mc_forward row, rotated once per round below
    290 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    291 	&add	($key,16);			# next round key; its non-zero result is also what the first jnz at dec_entry sees
    292 	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));	# $magic = address of the selected sr row (lea leaves the flags alone)
    293 	&jmp	(&label("dec_entry"));
    294 
    295 &set_label("dec_loop",16);
    296 ##
    297 ##  Inverse mix columns
    298 ##
    299 	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
    300 	&movdqa	("xmm1",&QWP(-0x10,$base));	# 1 : sb9t
    301 	&pshufb	("xmm4","xmm2");		# 4 = sb9u
    302 	&pshufb	("xmm1","xmm3");		# 1 = sb9t
    303 	&pxor	("xmm0","xmm4");		# 0 = round key (loaded at dec_entry) + sb9u
    304 	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
    305 	&pxor	("xmm0","xmm1");		# 0 = ch
    306 	&movdqa	("xmm1",&QWP(0x10,$base));	# 1 : sbdt
    307 
    308 	&pshufb	("xmm4","xmm2");		# 4 = sbdu
    309 	&pshufb	("xmm0","xmm5");		# MC ch
    310 	&pshufb	("xmm1","xmm3");		# 1 = sbdt
    311 	&pxor	("xmm0","xmm4");		# 0 = ch
    312 	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
    313 	&pxor	("xmm0","xmm1");		# 0 = ch
    314 	&movdqa	("xmm1",&QWP(0x30,$base));	# 1 : sbbt
    315 
    316 	&pshufb	("xmm4","xmm2");		# 4 = sbbu
    317 	&pshufb	("xmm0","xmm5");		# MC ch
    318 	&pshufb	("xmm1","xmm3");		# 1 = sbbt
    319 	&pxor	("xmm0","xmm4");		# 0 = ch
    320 	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
    321 	&pxor	("xmm0","xmm1");		# 0 = ch
    322 	&movdqa	("xmm1",&QWP(0x50,$base));	# 1 : sbet
    323 
    324 	&pshufb	("xmm4","xmm2");		# 4 = sbeu
    325 	&pshufb	("xmm0","xmm5");		# MC ch
    326 	&pshufb	("xmm1","xmm3");		# 1 = sbet
    327 	&pxor	("xmm0","xmm4");		# 0 = ch
    328 	&add	($key,16);			# next round key
    329 	&palignr("xmm5","xmm5",12);		# rotate the MC permutation for the next round
    330 	&pxor	("xmm0","xmm1");		# 0 = ch
    331 	&sub	($round,1);			# nr--
    332 
    333 &set_label("dec_entry");
    334 	# top of round
    335 	&movdqa	("xmm1","xmm6");		# 1 : i
    336 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    337 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    338 	&pand	("xmm0","xmm6");		# 0 = k
    339 	&psrld	("xmm1",4);			# 1 = i
    340 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    341 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    342 	&pxor	("xmm0","xmm1");		# 0 = j
    343 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    344 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    345 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    346 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    347 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    348 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    349 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    350 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    351 	&pxor	("xmm2","xmm0");		# 2 = io
    352 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    353 	&movdqu	("xmm0",&QWP(0,$key));		# 0 = round key for the step that follows
    354 	&pxor	("xmm3","xmm1");		# 3 = jo
    355 	&jnz	(&label("dec_loop"));		# tests the last integer op ("sub $round,1", or "add $key,16" on first entry); the SSE ops in between leave EFLAGS alone
    356 
    357 	# middle of last round
    358 	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
    359 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    360 	&pxor	("xmm4","xmm0");		# 4 = sbou + k
    361 	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
    362 	&movdqa	("xmm2",&QWP(0,$magic));	# 2 : sr row selected in the prologue
    363 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    364 	&pxor	("xmm0","xmm4");		# 0 = A
    365 	&pshufb	("xmm0","xmm2");		# 0 = result after the sr permutation
    366 	&ret	();
    367 &function_end_B("_vpaes_decrypt_core");
    368 
    369 ########################################################
    370 ##                                                    ##
    371 ##                  AES key schedule                  ##
    372 ##                                                    ##
    373 ########################################################
    374 &function_begin_B("_vpaes_schedule_core");	# expand the user key at ($inp) into the schedule at ($key); $round = key bits, $out != 0 selects the decryption schedule, $magic = sr row offset
    375 	&add	($const,&DWP(0,"esp"));		# $const arrives as a displacement from pic_point; adding the return address makes it absolute
    376 	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
    377 	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
    378 
    379 	# input transform
    380 	&movdqa	("xmm3","xmm0");
    381 	&lea	($base,&DWP($k_ipt,$const));
    382 	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8 (stack slot: 0(esp) holds the return address, the caller's 16-byte aligned frame starts at 4(esp))
    383 	&call	("_vpaes_schedule_transform");
    384 	&movdqa	("xmm7","xmm0");
    385 
    386 	&test	($out,$out);
    387 	&jnz	(&label("schedule_am_decrypting"));
    388 
    389 	# encrypting, output zeroth round key after transform
    390 	&movdqu	(&QWP(0,$key),"xmm0");
    391 	&jmp	(&label("schedule_go"));
    392 
    393 &set_label("schedule_am_decrypting");
    394 	# decrypting, output zeroth round key after shiftrows
    395 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    396 	&pshufb	("xmm3","xmm1");
    397 	&movdqu	(&QWP(0,$key),"xmm3");
    398 	&xor	($magic,0x30);
    399 
    400 &set_label("schedule_go");
    401 	&cmp	($round,192);			# $round still holds the key length in bits here
    402 	&ja	(&label("schedule_256"));
    403 	&je	(&label("schedule_192"));
    404 	# 128: fall through
    405 
    406 ##
    407 ##  .schedule_128
    408 ##
    409 ##  128-bit specific part of key schedule.
    410 ##
    411 ##  This schedule is really simple, because all its parts
    412 ##  are accomplished by the subroutines.
    413 ##
    414 &set_label("schedule_128");
    415 	&mov	($round,10);			# from here on $round is a loop counter
    416 
    417 &set_label("loop_schedule_128");
    418 	&call	("_vpaes_schedule_round");
    419 	&dec	($round);
    420 	&jz	(&label("schedule_mangle_last"));
    421 	&call	("_vpaes_schedule_mangle");	# write output
    422 	&jmp	(&label("loop_schedule_128"));
    423 
    424 ##
    425 ##  .aes_schedule_192
    426 ##
    427 ##  192-bit specific part of key schedule.
    428 ##
    429 ##  The main body of this schedule is the same as the 128-bit
    430 ##  schedule, but with more smearing.  The long, high side is
    431 ##  stored in %xmm7 as before, and the short, low side is in
    432 ##  the high bits of %xmm6.
    433 ##
    434 ##  This schedule is somewhat nastier, however, because each
    435 ##  round produces 192 bits of key material, or 1.5 round keys.
    436 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
    437 ##  keys.
    438 ##
    439 &set_label("schedule_192",16);
    440 	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
    441 	&call	("_vpaes_schedule_transform");	# input transform
    442 	&movdqa	("xmm6","xmm0");		# save short part
    443 	&pxor	("xmm4","xmm4");		# clear 4
    444 	&movhlps("xmm6","xmm4");		# clobber low side with zeros
    445 	&mov	($round,4);
    446 
    447 &set_label("loop_schedule_192");
    448 	&call	("_vpaes_schedule_round");
    449 	&palignr("xmm0","xmm6",8);
    450 	&call	("_vpaes_schedule_mangle");	# save key n
    451 	&call	("_vpaes_schedule_192_smear");
    452 	&call	("_vpaes_schedule_mangle");	# save key n+1
    453 	&call	("_vpaes_schedule_round");
    454 	&dec	($round);
    455 	&jz	(&label("schedule_mangle_last"));
    456 	&call	("_vpaes_schedule_mangle");	# save key n+2
    457 	&call	("_vpaes_schedule_192_smear");
    458 	&jmp	(&label("loop_schedule_192"));
    459 
    460 ##
    461 ##  .aes_schedule_256
    462 ##
    463 ##  256-bit specific part of key schedule.
    464 ##
    465 ##  The structure here is very similar to the 128-bit
    466 ##  schedule, but with an additional "low side" in
    467 ##  %xmm6.  The low side's rounds are the same as the
    468 ##  high side's, except no rcon and no rotation.
    469 ##
    470 &set_label("schedule_256",16);
    471 	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
    472 	&call	("_vpaes_schedule_transform");	# input transform
    473 	&mov	($round,7);
    474 
    475 &set_label("loop_schedule_256");
    476 	&call	("_vpaes_schedule_mangle");	# output low result
    477 	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
    478 
    479 	# high round
    480 	&call	("_vpaes_schedule_round");
    481 	&dec	($round);
    482 	&jz	(&label("schedule_mangle_last"));
    483 	&call	("_vpaes_schedule_mangle");
    484 
    485 	# low round. swap xmm7 and xmm6
    486 	&pshufd	("xmm0","xmm0",0xFF);
    487 	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in the second 16-byte frame slot (4+16)
    488 	&movdqa	("xmm7","xmm6");
    489 	&call	("_vpaes_schedule_low_round");
    490 	&movdqa	("xmm7",&QWP(20,"esp"));	# restore xmm7
    491 
    492 	&jmp	(&label("loop_schedule_256"));
    493 
    494 ##
    495 ##  .aes_schedule_mangle_last
    496 ##
    497 ##  Mangler for last round of key schedule
    498 ##  Mangles %xmm0
    499 ##    when encrypting, outputs out(%xmm0) ^ 63
    500 ##    when decrypting, outputs unskew(%xmm0)
    501 ##
    502 ##  Always called right before return... jumps to cleanup and exits
    503 ##
    504 &set_label("schedule_mangle_last",16);
    505 	# schedule last round key from xmm0
    506 	&lea	($base,&DWP($k_deskew,$const));
    507 	&test	($out,$out);
    508 	&jnz	(&label("schedule_mangle_last_dec"));
    509 
    510 	# encrypting
    511 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    512 	&pshufb	("xmm0","xmm1");		# output permute
    513 	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
    514 	&add	($key,32);			# net +16 for encryption once the shared -16 below is applied
    515 
    516 &set_label("schedule_mangle_last_dec");
    517 	&add	($key,-16);
    518 	&pxor	("xmm0",&QWP($k_s63,$const));
    519 	&call	("_vpaes_schedule_transform");	# output transform
    520 	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
    521 
    522 	# cleanup
    523 	&pxor	("xmm0","xmm0");		# zero every xmm register used above before returning
    524 	&pxor	("xmm1","xmm1");
    525 	&pxor	("xmm2","xmm2");
    526 	&pxor	("xmm3","xmm3");
    527 	&pxor	("xmm4","xmm4");
    528 	&pxor	("xmm5","xmm5");
    529 	&pxor	("xmm6","xmm6");
    530 	&pxor	("xmm7","xmm7");
    531 	&ret	();
    532 &function_end_B("_vpaes_schedule_core");
    533 
    534 ##
    535 ##  .aes_schedule_192_smear
    536 ##
    537 ##  Smear the short, low side in the 192-bit key schedule.
    538 ##
    539 ##  Inputs:
    540 ##    %xmm7: high side, b  a  x  y
    541 ##    %xmm6:  low side, d  c  0  0
    542 ##    (no %xmm13 here: the routine clears %xmm1 itself and uses it as the zero source)
    543 ##
    544 ##  Outputs:
    545 ##    %xmm6: b+c+d  b+c  0  0
    546 ##    %xmm0: b+c+d  b+c  b  a
    547 ##
    548 &function_begin_B("_vpaes_schedule_192_smear");	# fold the high side (xmm7) into the short low side (xmm6); result in xmm0, xmm6 keeps only its high half
    549 	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
    550 	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
    551 	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
    552 	&pxor	("xmm1","xmm1");		# 1 = 0, zero source for the movhlps below
    553 	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
    554 	&movdqa	("xmm0","xmm6");
    555 	&movhlps("xmm6","xmm1");		# clobber low side with zeros
    556 	&ret	();
    557 &function_end_B("_vpaes_schedule_192_smear");
    558 
    559 ##
    560 ##  .aes_schedule_round
    561 ##
    562 ##  Runs one main round of the key schedule on %xmm0, %xmm7
    563 ##
    564 ##  Specifically, runs subbytes on the high dword of %xmm0
    565 ##  then rotates it by one byte and xors into the low dword of
    566 ##  %xmm7.
    567 ##
    568 ##  Adds rcon from low byte of "%xmm8" (a 16-byte stack slot in this
    569 ##  32-bit port), then rotates it for next rcon.
    570 ##
    571 ##  Smears the dwords of %xmm7 by xoring the low into the
    572 ##  second low, result into third, result into highest.
    573 ##
    574 ##  Returns results in %xmm7 = %xmm0.
    575 ##  Clobbers %xmm1-%xmm5.
    576 ##
    577 &function_begin_B("_vpaes_schedule_round");	# one key-schedule round on xmm0/xmm7; _vpaes_schedule_low_round is a second entry point that skips rcon and rotation
    578 	# extract rcon from xmm8
    579 	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8 (the slot written at 4(esp) in _vpaes_schedule_core, one return address deeper)
    580 	&pxor	("xmm1","xmm1");
    581 	&palignr("xmm1","xmm2",15);
    582 	&palignr("xmm2","xmm2",15);
    583 	&pxor	("xmm7","xmm1");
    584 
    585 	# rotate
    586 	&pshufd	("xmm0","xmm0",0xFF);
    587 	&palignr("xmm0","xmm0",1);
    588 
    589 	# fall through...
    590 	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8
    591 
    592 	# low round: same as high round, but no rotation and no rcon.
    593 &set_label("_vpaes_schedule_low_round");
    594 	# smear xmm7
    595 	&movdqa	("xmm1","xmm7");
    596 	&pslldq	("xmm7",4);
    597 	&pxor	("xmm7","xmm1");
    598 	&movdqa	("xmm1","xmm7");
    599 	&pslldq	("xmm7",8);
    600 	&pxor	("xmm7","xmm1");
    601 	&pxor	("xmm7",&QWP($k_s63,$const));
    602 
    603 	# subbyte
    604 	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F mask (xmm6/xmm7 hold key material here, so constants are reloaded)
    605 	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv table
    606 	&movdqa	("xmm1","xmm4");
    607 	&pandn	("xmm1","xmm0");
    608 	&psrld	("xmm1",4);			# 1 = i
    609 	&pand	("xmm0","xmm4");		# 0 = k
    610 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    611 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    612 	&pxor	("xmm0","xmm1");		# 0 = j
    613 	&movdqa	("xmm3","xmm5");		# 3 : 1/i
    614 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    615 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    616 	&movdqa	("xmm4","xmm5");		# 4 : 1/j
    617 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    618 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    619 	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
    620 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    621 	&pxor	("xmm2","xmm0");		# 2 = io
    622 	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
    623 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    624 	&pxor	("xmm3","xmm1");		# 3 = jo
    625 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    626 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    627 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    628 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    629 	&pxor	("xmm0","xmm4");		# 0 = sbox output
    630 
    631 	# add in smeared stuff
    632 	&pxor	("xmm0","xmm7");
    633 	&movdqa	("xmm7","xmm0");
    634 	&ret	();
    635 &function_end_B("_vpaes_schedule_round");
    636 
    637 ##
    638 ##  .aes_schedule_transform
    639 ##
    640 ##  Linear-transform %xmm0 according to tables at (%ebx)
    641 ##
    642 ##  Output in %xmm0
    643 ##  Clobbers %xmm1, %xmm2
    644 ##
    645 &function_begin_B("_vpaes_schedule_transform");	# xmm0 = table[0..15 at $base](low nibbles) ^ table[16..31 at $base](high nibbles)
    646 	&movdqa	("xmm2",&QWP($k_s0F,$const));	# 2 : s0F mask
    647 	&movdqa	("xmm1","xmm2");
    648 	&pandn	("xmm1","xmm0");		# 1 = high nibbles, still in place
    649 	&psrld	("xmm1",4);			# 1 = high nibbles moved down
    650 	&pand	("xmm0","xmm2");		# 0 = low nibbles
    651 	&movdqa	("xmm2",&QWP(0,$base));		# 2 : lo table
    652 	&pshufb	("xmm2","xmm0");		# 2 = lo lookup
    653 	&movdqa	("xmm0",&QWP(16,$base));	# 0 : hi table
    654 	&pshufb	("xmm0","xmm1");		# 0 = hi lookup
    655 	&pxor	("xmm0","xmm2");		# 0 = hi lookup ^ lo lookup
    656 	&ret	();
    657 &function_end_B("_vpaes_schedule_transform");
    658 
    659 ##
    660 ##  .aes_schedule_mangle
    661 ##
    662 ##  Mangle xmm0 from (basis-transformed) standard version
    663 ##  to our version.
    664 ##
    665 ##  On encrypt,
    666 ##    xor with 0x63
    667 ##    multiply by circulant 0,1,1,1
    668 ##    apply shiftrows transform
    669 ##
    670 ##  On decrypt,
    671 ##    xor with 0x63
    672 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
    673 ##    deskew
    674 ##    apply shiftrows transform
    675 ##
    676 ##
    677 ##  Writes out to (%edx), and increments or decrements it
    678 ##  Keeps track of round number mod 4 in %ecx
    679 ##  Preserves xmm0
    680 ##  Clobbers xmm1-xmm5
    681 ##
    682 &function_begin_B("_vpaes_schedule_mangle");	# convert the round key in xmm0 to the stored form and write it at the next ($key) slot; direction chosen by $out
    683 	&movdqa	("xmm4","xmm0");	# save xmm0 for later
    684 	&movdqa	("xmm5",&QWP($k_mc_forward,$const));	# 5 : first mc_forward row
    685 	&test	($out,$out);
    686 	&jnz	(&label("schedule_mangle_dec"));
    687 
    688 	# encrypting
    689 	&add	($key,16);			# encryption schedule grows upwards
    690 	&pxor	("xmm4",&QWP($k_s63,$const));
    691 	&pshufb	("xmm4","xmm5");
    692 	&movdqa	("xmm3","xmm4");
    693 	&pshufb	("xmm4","xmm5");
    694 	&pxor	("xmm3","xmm4");
    695 	&pshufb	("xmm4","xmm5");
    696 	&pxor	("xmm3","xmm4");
    697 
    698 	&jmp	(&label("schedule_mangle_both"));
    699 
    700 &set_label("schedule_mangle_dec",16);
    701 	# inverse mix columns
    702 	&movdqa	("xmm2",&QWP($k_s0F,$const));
    703 	&lea	($inp,&DWP($k_dksd,$const));	# $inp is reused as the dks* table pointer from here on
    704 	&movdqa	("xmm1","xmm2");
    705 	&pandn	("xmm1","xmm4");
    706 	&psrld	("xmm1",4);			# 1 = hi
    707 	&pand	("xmm4","xmm2");		# 4 = lo
    708 
    709 	&movdqa	("xmm2",&QWP(0,$inp));		# dksd
    710 	&pshufb	("xmm2","xmm4");
    711 	&movdqa	("xmm3",&QWP(0x10,$inp));
    712 	&pshufb	("xmm3","xmm1");
    713 	&pxor	("xmm3","xmm2");
    714 	&pshufb	("xmm3","xmm5");
    715 
    716 	&movdqa	("xmm2",&QWP(0x20,$inp));	# dksb
    717 	&pshufb	("xmm2","xmm4");
    718 	&pxor	("xmm2","xmm3");
    719 	&movdqa	("xmm3",&QWP(0x30,$inp));
    720 	&pshufb	("xmm3","xmm1");
    721 	&pxor	("xmm3","xmm2");
    722 	&pshufb	("xmm3","xmm5");
    723 
    724 	&movdqa	("xmm2",&QWP(0x40,$inp));	# dkse
    725 	&pshufb	("xmm2","xmm4");
    726 	&pxor	("xmm2","xmm3");
    727 	&movdqa	("xmm3",&QWP(0x50,$inp));
    728 	&pshufb	("xmm3","xmm1");
    729 	&pxor	("xmm3","xmm2");
    730 	&pshufb	("xmm3","xmm5");
    731 
    732 	&movdqa	("xmm2",&QWP(0x60,$inp));	# dks9
    733 	&pshufb	("xmm2","xmm4");
    734 	&pxor	("xmm2","xmm3");
    735 	&movdqa	("xmm3",&QWP(0x70,$inp));
    736 	&pshufb	("xmm3","xmm1");
    737 	&pxor	("xmm3","xmm2");
    738 
    739 	&add	($key,-16);			# decryption schedule is written downwards
    740 
    741 &set_label("schedule_mangle_both");
    742 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));	# 1 : sr row for this round
    743 	&pshufb	("xmm3","xmm1");
    744 	&add	($magic,-16);
    745 	&and	($magic,0x30);			# step the sr row offset backwards, mod 0x40
    746 	&movdqu	(&QWP(0,$key),"xmm3");		# store the mangled round key (unaligned)
    747 	&ret	();
    748 &function_end_B("_vpaes_schedule_mangle");
    749 
    750 #
    751 # Interface to OpenSSL
    752 #
    753 &function_begin("${PREFIX}_set_encrypt_key");	# build the encryption key schedule from (inp, bits) into key; eax is 0 on return
    754 	&mov	($inp,&wparam(0));		# inp
    755 	&lea	($base,&DWP(-56,"esp"));	# room for a 56-byte frame below the current stack pointer...
    756 	&mov	($round,&wparam(1));		# bits
    757 	&and	($base,-16);			# ...rounded down to a 16-byte boundary for the movdqa stack slots
    758 	&mov	($key,&wparam(2));		# key
    759 	&xchg	($base,"esp");			# alloca
    760 	&mov	(&DWP(48,"esp"),$base);		# keep the original esp at the top of the frame
    761 
    762 	&mov	($base,$round);
    763 	&shr	($base,5);
    764 	&add	($base,5);
    765 	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
    766 	&mov	($magic,0x30);			# initial sr row offset for _vpaes_schedule_core
    767 	&mov	($out,0);			# 0 = encryption schedule
    768 
    769 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# displacement only; the callee adds its return address (pic_point)
    770 	&call	("_vpaes_schedule_core");
    771 &set_label("pic_point");
    772 
    773 	&mov	("esp",&DWP(48,"esp"));		# drop the frame
    774 	&xor	("eax","eax");			# eax = 0
    775 &function_end("${PREFIX}_set_encrypt_key");
    776 
    777 &function_begin("${PREFIX}_set_decrypt_key");	# build the decryption key schedule from (inp, bits) into key; eax is 0 on return
    778 	&mov	($inp,&wparam(0));		# inp
    779 	&lea	($base,&DWP(-56,"esp"));	# room for a 56-byte frame below the current stack pointer...
    780 	&mov	($round,&wparam(1));		# bits
    781 	&and	($base,-16);			# ...rounded down to a 16-byte boundary for the movdqa stack slots
    782 	&mov	($key,&wparam(2));		# key
    783 	&xchg	($base,"esp");			# alloca
    784 	&mov	(&DWP(48,"esp"),$base);		# keep the original esp at the top of the frame
    785 
    786 	&mov	($base,$round);
    787 	&shr	($base,5);
    788 	&add	($base,5);
    789 	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
    790 	&shl	($base,4);
    791 	&lea	($key,&DWP(16,$key,$base));	# start at key+16*rounds+16; the schedule core and mangle step backwards from there
    792 
    793 	&mov	($out,1);			# non-zero = decryption schedule
    794 	&mov	($magic,$round);
    795 	&shr	($magic,1);
    796 	&and	($magic,32);
    797 	&xor	($magic,32);			# nbits==192?0:32;
    798 
    799 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# displacement only; the callee adds its return address (pic_point)
    800 	&call	("_vpaes_schedule_core");
    801 &set_label("pic_point");
    802 
    803 	&mov	("esp",&DWP(48,"esp"));		# drop the frame
    804 	&xor	("eax","eax");			# eax = 0
    805 &function_end("${PREFIX}_set_decrypt_key");
    806 
    807 &function_begin("${PREFIX}_encrypt");	# encrypt one 16-byte block from inp to out with the schedule in key
    808 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# displacement only; _vpaes_preheat adds its return address (pic_point)
    809 	&call	("_vpaes_preheat");
    810 &set_label("pic_point");
    811 	&mov	($inp,&wparam(0));		# inp
    812 	&lea	($base,&DWP(-56,"esp"));	# same 56-byte, 16-byte aligned frame as the other entry points
    813 	&mov	($out,&wparam(1));		# out
    814 	&and	($base,-16);
    815 	&mov	($key,&wparam(2));		# key
    816 	&xchg	($base,"esp");			# alloca
    817 	&mov	(&DWP(48,"esp"),$base);		# keep the original esp at the top of the frame
    818 
    819 	&movdqu	("xmm0",&QWP(0,$inp));		# load the block (unaligned)
    820 	&call	("_vpaes_encrypt_core");
    821 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    822 
    823 	&mov	("esp",&DWP(48,"esp"));		# drop the frame
    824 &function_end("${PREFIX}_encrypt");
    825 
    826 &function_begin("${PREFIX}_decrypt");	# decrypt one 16-byte block from inp to out with the schedule in key
    827 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# displacement only; _vpaes_preheat adds its return address (pic_point)
    828 	&call	("_vpaes_preheat");
    829 &set_label("pic_point");
    830 	&mov	($inp,&wparam(0));		# inp
    831 	&lea	($base,&DWP(-56,"esp"));	# same 56-byte, 16-byte aligned frame as the other entry points
    832 	&mov	($out,&wparam(1));		# out
    833 	&and	($base,-16);
    834 	&mov	($key,&wparam(2));		# key
    835 	&xchg	($base,"esp");			# alloca
    836 	&mov	(&DWP(48,"esp"),$base);		# keep the original esp at the top of the frame
    837 
    838 	&movdqu	("xmm0",&QWP(0,$inp));		# load the block (unaligned)
    839 	&call	("_vpaes_decrypt_core");
    840 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    841 
    842 	&mov	("esp",&DWP(48,"esp"));		# drop the frame
    843 &function_end("${PREFIX}_decrypt");
    844 
    845 &function_begin("${PREFIX}_cbc_encrypt");	# CBC over whole 16-byte blocks: (inp, out, len, key, ivp, enc); enc == 0 decrypts; the final chaining value is written back to ivp
    846 	&mov	($inp,&wparam(0));		# inp
    847 	&mov	($out,&wparam(1));		# out
    848 	&mov	($round,&wparam(2));		# len
    849 	&mov	($key,&wparam(3));		# key
    850 	&sub	($round,16);			# bias len by one block so the loops can exit on borrow
    851 	&jc	(&label("cbc_abort"));		# len < 16: nothing to do, ivp is left untouched
    852 	&lea	($base,&DWP(-56,"esp"));	# same 56-byte, 16-byte aligned frame as the other entry points
    853 	&mov	($const,&wparam(4));		# ivp
    854 	&and	($base,-16);
    855 	&mov	($magic,&wparam(5));		# enc
    856 	&xchg	($base,"esp");			# alloca
    857 	&movdqu	("xmm1",&QWP(0,$const));	# load IV
    858 	&sub	($out,$inp);			# keep out as a delta from inp so one pointer walks both buffers
    859 	&mov	(&DWP(48,"esp"),$base);		# keep the original esp at the top of the frame
    860 
    861 	&mov	(&DWP(0,"esp"),$out);		# save out
    862 	&mov	(&DWP(4,"esp"),$key);		# save key (statement terminated explicitly so it is not AND-ed with the next call)
    863 	&mov	(&DWP(8,"esp"),$const);		# save ivp
    864 	&mov	($out,$round);			# $out works as $len
    865 
    866 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# displacement only; _vpaes_preheat adds its return address (pic_point)
    867 	&call	("_vpaes_preheat");
    868 &set_label("pic_point");
    869 	&cmp	($magic,0);
    870 	&je	(&label("cbc_dec_loop"));
    871 	&jmp	(&label("cbc_enc_loop"));
    872 
    873 &set_label("cbc_enc_loop",16);
    874 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    875 	&pxor	("xmm0","xmm1");		# inp^=iv
    876 	&call	("_vpaes_encrypt_core");	# advances $key and reuses $base, hence the reloads below
    877 	&mov	($base,&DWP(0,"esp"));		# restore out
    878 	&mov	($key,&DWP(4,"esp"));		# restore key
    879 	&movdqa	("xmm1","xmm0");		# ciphertext becomes the next IV
    880 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    881 	&lea	($inp,&DWP(16,$inp));		# lea keeps the flags out of the picture
    882 	&sub	($out,16);
    883 	&jnc	(&label("cbc_enc_loop"));	# loop until the remaining length borrows
    884 	&jmp	(&label("cbc_done"));
    885 
    886 &set_label("cbc_dec_loop",16);
    887 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    888 	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
    889 	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
    890 	&call	("_vpaes_decrypt_core");	# advances $key and reuses $base, hence the reloads below
    891 	&mov	($base,&DWP(0,"esp"));		# restore out
    892 	&mov	($key,&DWP(4,"esp"));		# restore key
    893 	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
    894 	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
    895 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    896 	&lea	($inp,&DWP(16,$inp));		# lea keeps the flags out of the picture
    897 	&sub	($out,16);
    898 	&jnc	(&label("cbc_dec_loop"));	# loop until the remaining length borrows
    899 
    900 &set_label("cbc_done");
    901 	&mov	($base,&DWP(8,"esp"));		# restore ivp
    902 	&mov	("esp",&DWP(48,"esp"));		# drop the frame
    903 	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
    904 &set_label("cbc_abort");
    905 &function_end("${PREFIX}_cbc_encrypt");
    906 
    906 &asm_finish();	# hand the accumulated module over to x86asm.pl for output
    907 
    908 close STDOUT or die "error closing STDOUT: $!";	# buffered write errors (e.g. disk full) only surface at close; do not leave a truncated file unnoticed
    910