Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 ######################################################################
      4 ## Constant-time SSSE3 AES core implementation.
      5 ## version 0.1
      6 ##
      7 ## By Mike Hamburg (Stanford University), 2009
      8 ## Public domain.
      9 ##
     10 ## For details see http://shiftleft.org/papers/vector_aes/ and
     11 ## http://crypto.stanford.edu/vpaes/.
     12 
     13 ######################################################################
     14 # September 2011.
     15 #
     16 # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
     17 # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
     18 # doesn't handle partial vectors (doesn't have to if called from
     19 # EVP only). "Drop-in" implies that this module doesn't share key
     20 # schedule structure with the original nor does it make assumption
     21 # about its alignment...
     22 #
     23 # Performance summary. aes-586.pl column lists large-block CBC
     24 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
     25 # byte processed with 128-bit key, and vpaes-x86.pl column - [also
     26 # large-block CBC] encrypt/decrypt.
     27 #
     28 #		aes-586.pl		vpaes-x86.pl
     29 #
     30 # Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
     31 # Nehalem	27.9/40.4/18.1		10.2/11.9
     32 # Atom		70.7/92.1/60.1		61.1/75.4(***)
     33 # Silvermont	45.4/62.9/24.1		49.2/61.1(***)
     34 #
     35 # (*)	"Hyper-threading" in the context refers rather to cache shared
     36 #	among multiple cores, than to specifically Intel HTT. As vast
     37 #	majority of contemporary cores share cache, slower code path
     38 #	is common place. In other words "with-hyper-threading-off"
     39 #	results are presented mostly for reference purposes.
     40 #
     41 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
     42 #
     43 # (***)	Less impressive improvement on Core 2 and Atom is due to slow
     44 #	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
     45 #	and +15% on Atom (as implied, over "hyper-threading-safe"
     46 #	code path).
     47 #
     48 #						<appro@openssl.org>
     49 
     50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of the script path, trailing separator included
     51 push(@INC,"${dir}","${dir}../../perlasm");	# search the script directory and ../../perlasm for x86asm.pl
     52 require "x86asm.pl";
     53 
     54 &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");	# $x86only is true when the last argument is "386"
     55 
     56 $PREFIX="vpaes";	# name prefix of the public entry points (${PREFIX}_encrypt etc.)
     57 
     58 my  ($round, $base, $magic, $key, $const, $inp, $out)=	# symbolic names for the seven general-purpose registers
     59     ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");	# used by every routine in this file
     60 
     61 &static_label("_vpaes_consts");
     62 &static_label("_vpaes_schedule_low_round");
     63 
     64 &set_label("_vpaes_consts",64);	# start of the constant table; code reaches it through $const = _vpaes_consts+0x30, so each $k_* below is a byte offset from that point
     65 $k_inv=-0x30;		# inv, inva (each &data_word row is 16 bytes, hence the 0x10 steps between offsets)
     66 	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
     67 	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
     68 
     69 $k_s0F=-0x10;		# s0F: 0x0F in every byte, used as the low-nibble mask
     70 	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
     71 
     72 $k_ipt=0x00;		# input transform (lo, hi)
     73 	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
     74 	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
     75 
     76 $k_sb1=0x20;		# sb1u, sb1t
     77 	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
     78 	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
     79 $k_sb2=0x40;		# sb2u, sb2t
     80 	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
     81 	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
     82 $k_sbo=0x60;		# sbou, sbot
     83 	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
     84 	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
     85 
     86 $k_mc_forward=0x80;	# mc_forward: four 16-byte rows, selected with a 0x00/0x10/0x20/0x30 index
     87 	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
     88 	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
     89 	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
     90 	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
     91 
     92 $k_mc_backward=0xc0;	# mc_backward: four rows, 0x40 bytes after mc_forward
     93 	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
     94 	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
     95 	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
     96 	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
     97 
     98 $k_sr=0x100;		# sr: four rows, 0x40 bytes after mc_backward
     99 	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    100 	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    101 	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    102 	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
    103 
    104 $k_rcon=0x140;		# rcon
    105 	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
    106 
    107 $k_s63=0x150;		# s63: all equal to 0x63 transformed
    108 	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
    109 
    110 $k_opt=0x160;		# output transform
    111 	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    112 	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
    113 
    114 $k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
    115 	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    116 	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
    117 ##
    118 ##  Decryption stuff
    119 ##  Key schedule constants
    120 ##
    121 $k_dksd=0x1a0;		# decryption key schedule: invskew x*D
    122 	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    123 	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
    124 $k_dksb=0x1c0;		# decryption key schedule: invskew x*B
    125 	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    126 	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
    127 $k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
    128 	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    129 	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
    130 $k_dks9=0x200;		# decryption key schedule: invskew x*9
    131 	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    132 	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
    133 
    134 ##
    135 ##  Decryption stuff
    136 ##  Round function constants
    137 ##
    138 $k_dipt=0x220;		# decryption input transform
    139 	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    140 	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
    141 
    142 $k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
    143 	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    144 	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
    145 $k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
    146 	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    147 	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
    148 $k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
    149 	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    150 	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
    151 $k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
    152 	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    153 	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
    154 $k_dsbo=0x2c0;		# decryption sbox final output
    155 	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    156 	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
    157 &asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
    158 &align	(64);
    159 
    160 &function_begin_B("_vpaes_preheat");	# finish the position-independent $const pointer and preload xmm6/xmm7
    161 	&add	($const,&DWP(0,"esp"));		# add our return address; callers preload $const with _vpaes_consts+0x30 minus the label placed right after the call
    162 	&movdqa	("xmm7",&QWP($k_inv,$const));	# xmm7 = first row of k_inv
    163 	&movdqa	("xmm6",&QWP($k_s0F,$const));	# xmm6 = k_s0F (0x0F in every byte)
    164 	&ret	();
    165 &function_end_B("_vpaes_preheat");
    166 
    167 ##
    168 ##  _vpaes_encrypt_core
    169 ##
    170 ##  AES-encrypt %xmm0.
    171 ##
    172 ##  Inputs:
    173 ##     %xmm0 = input
    174 ##     %xmm6-%xmm7 as in _vpaes_preheat
    175 ##    (%edx) = scheduled keys, round counter at 240(%edx)
    176 ##     %ebp  = constants pointer as completed by _vpaes_preheat
    177 ##  Output in %xmm0
    178 ##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
    179 ##
    180 ##
    181 &function_begin_B("_vpaes_encrypt_core");
    182 	&mov	($magic,16);			# row offset into mc_forward/mc_backward, kept mod 0x40
    183 	&mov	($round,&DWP(240,$key));	# round counter stored at 240($key)
    184 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask
    185 	&movdqa	("xmm2",&QWP($k_ipt,$const));	# 2 : ipt lo
    186 	&pandn	("xmm1","xmm0");		# 1 = high nibbles, still <<4
    187 	&pand	("xmm0","xmm6");		# 0 = low nibbles
    188 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (unaligned load)
    189 	&pshufb	("xmm2","xmm0");		# 2 = ipt lo[low nibbles]
    190 	&movdqa	("xmm0",&QWP($k_ipt+16,$const));# 0 : ipt hi
    191 	&pxor	("xmm2","xmm5");		# 2 ^= round key 0
    192 	&psrld	("xmm1",4);			# 1 = high nibbles
    193 	&add	($key,16);			# next key; the flags set here are what the first jnz at enc_entry tests
    194 	&pshufb	("xmm0","xmm1");		# 0 = ipt hi[high nibbles]
    195 	&lea	($base,&DWP($k_mc_backward,$const));	# $base = &k_mc_backward; forward rows sit at -0x40, sr rows at +0x40
    196 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    197 	&jmp	(&label("enc_entry"));
    198 
    199 
    200 &set_label("enc_loop",16);
    201 	# middle of middle round
    202 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    203 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    204 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    205 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    206 	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
    207 	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
    208 	&pxor	("xmm0","xmm4");		# 0 = A
    209 	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
    210 	&pshufb	("xmm5","xmm2");		# 5 = sb2u
    211 	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
    212 	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
    213 	&pshufb	("xmm2","xmm3");		# 2 = sb2t
    214 	&movdqa	("xmm3","xmm0");		# 3 = A
    215 	&pxor	("xmm2","xmm5");		# 2 = 2A
    216 	&pshufb	("xmm0","xmm1");		# 0 = B
    217 	&add	($key,16);			# next key
    218 	&pxor	("xmm0","xmm2");		# 0 = 2A+B
    219 	&pshufb	("xmm3","xmm4");		# 3 = D
    220 	&add	($magic,16);			# next mc
    221 	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
    222 	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
    223 	&and	($magic,0x30);			# ... mod 4
    224 	&sub	($round,1);			# nr--; sets ZF for the jnz at the end of enc_entry
    225 	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
    226 
    227 &set_label("enc_entry");
    228 	# top of round
    229 	&movdqa	("xmm1","xmm6");		# 1 : i
    230 	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
    231 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    232 	&psrld	("xmm1",4);			# 1 = i
    233 	&pand	("xmm0","xmm6");		# 0 = k
    234 	&pshufb	("xmm5","xmm0");		# 5 = a/k
    235 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    236 	&pxor	("xmm0","xmm1");		# 0 = j
    237 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    238 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    239 	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
    240 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    241 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    242 	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
    243 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    244 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    245 	&pxor	("xmm2","xmm0");		# 2 = io
    246 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    247 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = current round key
    248 	&pxor	("xmm3","xmm1");		# 3 = jo
    249 	&jnz	(&label("enc_loop"));		# SSE instructions leave EFLAGS alone, so ZF is still that of the last sub/add
    250 
    251 	# middle of last round
    252 	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
    253 	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
    254 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    255 	&pxor	("xmm4","xmm5");		# 4 = sbou + k
    256 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    257 	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
    258 	&pxor	("xmm0","xmm4");		# 0 = A
    259 	&pshufb	("xmm0","xmm1");		# 0 = A permuted by the selected sr row
    260 	&ret	();
    261 &function_end_B("_vpaes_encrypt_core");
    262 
    263 ##
    264 ##  Decryption core
    265 ##
    266 ##  Same API as encryption core: block in %xmm0, keys at (%edx), result in %xmm0.
    267 ##
    268 &function_begin_B("_vpaes_decrypt_core");
    269 	&lea	($base,&DWP($k_dsbd,$const));	# $base = &k_dsbd; the other decryption tables are addressed relative to it
    270 	&mov	($round,&DWP(240,$key));	# round counter stored at 240($key)
    271 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask
    272 	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));	# 2 : dipt lo
    273 	&pandn	("xmm1","xmm0");		# 1 = high nibbles, still <<4
    274 	&mov	($magic,$round);
    275 	&psrld	("xmm1",4);			# 1 = high nibbles
    276 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (unaligned load)
    277 	&shl	($magic,4);
    278 	&pand	("xmm0","xmm6");		# 0 = low nibbles
    279 	&pshufb	("xmm2","xmm0");		# 2 = dipt lo[low nibbles]
    280 	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));	# 0 : dipt hi
    281 	&xor	($magic,0x30);
    282 	&pshufb	("xmm0","xmm1");		# 0 = dipt hi[high nibbles]
    283 	&and	($magic,0x30);			# $magic = ((rounds<<4)^0x30)&0x30, a row offset into k_sr
    284 	&pxor	("xmm2","xmm5");		# 2 ^= round key 0
    285 	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));	# 5 = last mc_forward row, rotated each round below
    286 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    287 	&add	($key,16);			# next key; the flags set here are what the first jnz at dec_entry tests (lea leaves them alone)
    288 	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));	# $magic = address of the selected k_sr row
    289 	&jmp	(&label("dec_entry"));
    290 
    291 &set_label("dec_loop",16);
    292 ##
    293 ##  Inverse mix columns
    294 ##
    295 	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
    296 	&movdqa	("xmm1",&QWP(-0x10,$base));	# 1 : sb9t
    297 	&pshufb	("xmm4","xmm2");		# 4 = sb9u
    298 	&pshufb	("xmm1","xmm3");		# 1 = sb9t
    299 	&pxor	("xmm0","xmm4");		# 0 = round key + sb9u
    300 	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
    301 	&pxor	("xmm0","xmm1");		# 0 = ch
    302 	&movdqa	("xmm1",&QWP(0x10,$base));	# 1 : sbdt
    303 
    304 	&pshufb	("xmm4","xmm2");		# 4 = sbdu
    305 	&pshufb	("xmm0","xmm5");		# MC ch
    306 	&pshufb	("xmm1","xmm3");		# 1 = sbdt
    307 	&pxor	("xmm0","xmm4");		# 0 = ch + sbdu
    308 	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
    309 	&pxor	("xmm0","xmm1");		# 0 = ch
    310 	&movdqa	("xmm1",&QWP(0x30,$base));	# 1 : sbbt
    311 
    312 	&pshufb	("xmm4","xmm2");		# 4 = sbbu
    313 	&pshufb	("xmm0","xmm5");		# MC ch
    314 	&pshufb	("xmm1","xmm3");		# 1 = sbbt
    315 	&pxor	("xmm0","xmm4");		# 0 = ch + sbbu
    316 	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
    317 	&pxor	("xmm0","xmm1");		# 0 = ch
    318 	&movdqa	("xmm1",&QWP(0x50,$base));	# 1 : sbet
    319 
    320 	&pshufb	("xmm4","xmm2");		# 4 = sbeu
    321 	&pshufb	("xmm0","xmm5");		# MC ch
    322 	&pshufb	("xmm1","xmm3");		# 1 = sbet
    323 	&pxor	("xmm0","xmm4");		# 0 = ch + sbeu
    324 	&add	($key,16);			# next round key
    325 	&palignr("xmm5","xmm5",12);		# rotate the mc_forward pattern for the next round
    326 	&pxor	("xmm0","xmm1");		# 0 = ch
    327 	&sub	($round,1);			# nr--; sets ZF for the jnz at the end of dec_entry
    328 
    329 &set_label("dec_entry");
    330 	# top of round
    331 	&movdqa	("xmm1","xmm6");		# 1 : i
    332 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    333 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    334 	&pand	("xmm0","xmm6");		# 0 = k
    335 	&psrld	("xmm1",4);			# 1 = i
    336 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    337 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    338 	&pxor	("xmm0","xmm1");		# 0 = j
    339 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    340 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    341 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    342 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    343 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    344 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    345 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    346 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    347 	&pxor	("xmm2","xmm0");		# 2 = io
    348 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    349 	&movdqu	("xmm0",&QWP(0,$key));		# 0 = current round key
    350 	&pxor	("xmm3","xmm1");		# 3 = jo
    351 	&jnz	(&label("dec_loop"));		# SSE instructions leave EFLAGS alone, so ZF is still that of the last sub/add
    352 
    353 	# middle of last round
    354 	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
    355 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    356 	&pxor	("xmm4","xmm0");		# 4 = sbou + k
    357 	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
    358 	&movdqa	("xmm2",&QWP(0,$magic));	# 2 = k_sr row selected in the prologue
    359 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    360 	&pxor	("xmm0","xmm4");		# 0 = A
    361 	&pshufb	("xmm0","xmm2");		# 0 = A permuted by that sr row
    362 	&ret	();
    363 &function_end_B("_vpaes_decrypt_core");
    364 
    365 ########################################################
    366 ##                                                    ##
    367 ##                  AES key schedule                  ##
    368 ##                                                    ##
    369 ########################################################
    370 &function_begin_B("_vpaes_schedule_core");	# expand the user key at ($inp) into round keys at ($key); $round = key bits, $out != 0 selects the decryption schedule, $magic indexes k_sr
    371 	&add	($const,&DWP(0,"esp"));		# add our return address to complete the position-independent constants pointer
    372 	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
    373 	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
    374 
    375 	# input transform
    376 	&movdqa	("xmm3","xmm0");		# keep the untransformed key for the decrypt path below
    377 	&lea	($base,&DWP($k_ipt,$const));	# tables used by _vpaes_schedule_transform
    378 	&movdqa	(&QWP(4,"esp"),"xmm2");		# rcon kept on the stack (32-bit mode has no xmm8); 4(%esp) here is 0 of the caller's 16-byte-aligned frame
    379 	&call	("_vpaes_schedule_transform");
    380 	&movdqa	("xmm7","xmm0");		# 7 = transformed key
    381 
    382 	&test	($out,$out);
    383 	&jnz	(&label("schedule_am_decrypting"));
    384 
    385 	# encrypting, output zeroth round key after transform
    386 	&movdqu	(&QWP(0,$key),"xmm0");
    387 	&jmp	(&label("schedule_go"));
    388 
    389 &set_label("schedule_am_decrypting");
    390 	# decrypting, output zeroth round key after shiftrows
    391 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    392 	&pshufb	("xmm3","xmm1");		# permute the untransformed key with the selected sr row
    393 	&movdqu	(&QWP(0,$key),"xmm3");
    394 	&xor	($magic,0x30);
    395 
    396 &set_label("schedule_go");
    397 	&cmp	($round,192);			# $round still holds the key length in bits
    398 	&ja	(&label("schedule_256"));
    399 	&je	(&label("schedule_192"));
    400 	# 128: fall through
    401 
    402 ##
    403 ##  .schedule_128
    404 ##
    405 ##  128-bit specific part of key schedule.
    406 ##
    407 ##  This schedule is really simple, because all its parts
    408 ##  are accomplished by the subroutines.
    409 ##
    410 &set_label("schedule_128");
    411 	&mov	($round,10);			# $round is reused as the loop counter
    412 
    413 &set_label("loop_schedule_128");
    414 	&call	("_vpaes_schedule_round");
    415 	&dec	($round);
    416 	&jz	(&label("schedule_mangle_last"));
    417 	&call	("_vpaes_schedule_mangle");	# write output
    418 	&jmp	(&label("loop_schedule_128"));
    419 
    420 ##
    421 ##  .aes_schedule_192
    422 ##
    423 ##  192-bit specific part of key schedule.
    424 ##
    425 ##  The main body of this schedule is the same as the 128-bit
    426 ##  schedule, but with more smearing.  The long, high side is
    427 ##  stored in %xmm7 as before, and the short, low side is in
    428 ##  the high bits of %xmm6.
    429 ##
    430 ##  This schedule is somewhat nastier, however, because each
    431 ##  round produces 192 bits of key material, or 1.5 round keys.
    432 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
    433 ##  keys.
    434 ##
    435 &set_label("schedule_192",16);
    436 	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
    437 	&call	("_vpaes_schedule_transform");	# input transform	
    438 	&movdqa	("xmm6","xmm0");		# save short part
    439 	&pxor	("xmm4","xmm4");		# clear 4
    440 	&movhlps("xmm6","xmm4");		# clobber low side with zeros
    441 	&mov	($round,4);			# loop counter
    442 
    443 &set_label("loop_schedule_192");
    444 	&call	("_vpaes_schedule_round");
    445 	&palignr("xmm0","xmm6",8);
    446 	&call	("_vpaes_schedule_mangle");	# save key n
    447 	&call	("_vpaes_schedule_192_smear");
    448 	&call	("_vpaes_schedule_mangle");	# save key n+1
    449 	&call	("_vpaes_schedule_round");
    450 	&dec	($round);
    451 	&jz	(&label("schedule_mangle_last"));
    452 	&call	("_vpaes_schedule_mangle");	# save key n+2
    453 	&call	("_vpaes_schedule_192_smear");
    454 	&jmp	(&label("loop_schedule_192"));
    455 
    456 ##
    457 ##  .aes_schedule_256
    458 ##
    459 ##  256-bit specific part of key schedule.
    460 ##
    461 ##  The structure here is very similar to the 128-bit
    462 ##  schedule, but with an additional "low side" in
    463 ##  %xmm6.  The low side's rounds are the same as the
    464 ##  high side's, except no rcon and no rotation.
    465 ##
    466 &set_label("schedule_256",16);
    467 	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
    468 	&call	("_vpaes_schedule_transform");	# input transform	
    469 	&mov	($round,7);			# loop counter
    470 
    471 &set_label("loop_schedule_256");
    472 	&call	("_vpaes_schedule_mangle");	# output low result
    473 	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
    474 
    475 	# high round
    476 	&call	("_vpaes_schedule_round");
    477 	&dec	($round);
    478 	&jz	(&label("schedule_mangle_last"));
    479 	&call	("_vpaes_schedule_mangle");	
    480 
    481 	# low round. swap xmm7 and xmm6
    482 	&pshufd	("xmm0","xmm0",0xFF);
    483 	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in the caller's frame (20(%esp) here is its offset 16)
    484 	&movdqa	("xmm7","xmm6");
    485 	&call	("_vpaes_schedule_low_round");
    486 	&movdqa	("xmm7",&QWP(20,"esp"));	# bring the high side back
    487 
    488 	&jmp	(&label("loop_schedule_256"));
    489 
    490 ##
    491 ##  .aes_schedule_mangle_last
    492 ##
    493 ##  Mangler for last round of key schedule
    494 ##  Mangles %xmm0
    495 ##    when encrypting, outputs out(%xmm0) ^ 63
    496 ##    when decrypting, outputs unskew(%xmm0)
    497 ##
    498 ##  Always called right before return... jumps to cleanup and exits
    499 ##
    500 &set_label("schedule_mangle_last",16);
    501 	# schedule last round key from xmm0
    502 	&lea	($base,&DWP($k_deskew,$const));	# decrypting default: deskew tables
    503 	&test	($out,$out);
    504 	&jnz	(&label("schedule_mangle_last_dec"));
    505 
    506 	# encrypting
    507 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    508 	&pshufb	("xmm0","xmm1");		# output permute
    509 	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
    510 	&add	($key,32);			# together with the -16 below: net +16 when encrypting
    511 
    512 &set_label("schedule_mangle_last_dec");
    513 	&add	($key,-16);			# net -16 when decrypting
    514 	&pxor	("xmm0",&QWP($k_s63,$const));
    515 	&call	("_vpaes_schedule_transform");	# output transform
    516 	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
    517 
    518 	# cleanup
    519 	&pxor	("xmm0","xmm0");		# zero every xmm register used above
    520 	&pxor	("xmm1","xmm1");
    521 	&pxor	("xmm2","xmm2");
    522 	&pxor	("xmm3","xmm3");
    523 	&pxor	("xmm4","xmm4");
    524 	&pxor	("xmm5","xmm5");
    525 	&pxor	("xmm6","xmm6");
    526 	&pxor	("xmm7","xmm7");
    527 	&ret	();
    528 &function_end_B("_vpaes_schedule_core");
    529 
    530 ##
    531 ##  .aes_schedule_192_smear
    532 ##
    533 ##  Smear the short, low side in the 192-bit key schedule.
    534 ##
    535 ##  Inputs:
    536 ##    %xmm7: high side, b  a  x  y
    537 ##    %xmm6:  low side, d  c  0  0
    538 ##    (no dedicated zero register here: %xmm1 is cleared locally)
    539 ##
    540 ##  Outputs:
    541 ##    %xmm6: b+c+d  b+c  0  0
    542 ##    %xmm0: b+c+d  b+c  b  a
    543 ##  Clobbers %xmm1 (left zeroed)
    544 &function_begin_B("_vpaes_schedule_192_smear");
    545 	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
    546 	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
    547 	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
    548 	&pxor	("xmm1","xmm1");		# 1 = 0, source of the zeros for movhlps below
    549 	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
    550 	&movdqa	("xmm0","xmm6");
    551 	&movhlps("xmm6","xmm1");		# clobber low side with zeros
    552 	&ret	();
    553 &function_end_B("_vpaes_schedule_192_smear");
    554 
    555 ##
    556 ##  .aes_schedule_round
    557 ##
    558 ##  Runs one main round of the key schedule on %xmm0, %xmm7
    559 ##
    560 ##  Specifically, runs subbytes on the high dword of %xmm0
    561 ##  then rotates it by one byte and xors into the low dword of
    562 ##  %xmm7.
    563 ##
    564 ##  Adds rcon taken from the rcon vector kept at 8(%esp) (the "xmm8"
    565 ##  of the comments below), then rotates that vector for next rcon.
    566 ##
    567 ##  Smears the dwords of %xmm7 by xoring the low into the
    568 ##  second low, result into third, result into highest.
    569 ##
    570 ##  Returns results in %xmm7 = %xmm0.
    571 ##  Clobbers %xmm1-%xmm5.
    572 ##
    573 &function_begin_B("_vpaes_schedule_round");
    574 	# extract rcon from xmm8
    575 	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8 (stack slot written by _vpaes_schedule_core, one call level deeper here)
    576 	&pxor	("xmm1","xmm1");
    577 	&palignr("xmm1","xmm2",15);		# 1 = top byte of the rcon vector moved to byte 0, rest zero
    578 	&palignr("xmm2","xmm2",15);		# rotate the rcon vector by one byte
    579 	&pxor	("xmm7","xmm1");		# add rcon
    580 
    581 	# rotate
    582 	&pshufd	("xmm0","xmm0",0xFF);		# broadcast the high dword
    583 	&palignr("xmm0","xmm0",1);		# rotate by one byte
    584 
    585 	# fall through...
    586 	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8 (store the rotated rcon vector back)
    587 
    588 	# low round: same as high round, but no rotation and no rcon.
    589 &set_label("_vpaes_schedule_low_round");
    590 	# smear xmm7
    591 	&movdqa	("xmm1","xmm7");
    592 	&pslldq	("xmm7",4);
    593 	&pxor	("xmm7","xmm1");
    594 	&movdqa	("xmm1","xmm7");
    595 	&pslldq	("xmm7",8);
    596 	&pxor	("xmm7","xmm1");
    597 	&pxor	("xmm7",&QWP($k_s63,$const));
    598 
    599 	# subbyte
    600 	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F mask (xmm6 is not preloaded in the schedule)
    601 	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv table, used for the 1/x lookups
    602 	&movdqa	("xmm1","xmm4");	
    603 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    604 	&psrld	("xmm1",4);			# 1 = i
    605 	&pand	("xmm0","xmm4");		# 0 = k
    606 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    607 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    608 	&pxor	("xmm0","xmm1");		# 0 = j
    609 	&movdqa	("xmm3","xmm5");		# 3 : 1/i
    610 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    611 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    612 	&movdqa	("xmm4","xmm5");		# 4 : 1/j
    613 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    614 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    615 	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
    616 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    617 	&pxor	("xmm2","xmm0");		# 2 = io
    618 	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
    619 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    620 	&pxor	("xmm3","xmm1");		# 3 = jo
    621 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    622 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    623 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    624 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    625 	&pxor	("xmm0","xmm4");		# 0 = sbox output
    626 
    627 	# add in smeared stuff
    628 	&pxor	("xmm0","xmm7");
    629 	&movdqa	("xmm7","xmm0");
    630 	&ret	();
    631 &function_end_B("_vpaes_schedule_round");
    632 
    633 ##
    634 ##  .aes_schedule_transform
    635 ##
    636 ##  Linear-transform %xmm0 according to tables at (%ebx):
    637 ##  low-nibble table at 0(%ebx), high-nibble table at 16(%ebx).
    638 ##  Output in %xmm0
    639 ##  Clobbers %xmm1, %xmm2
    640 ##
    641 &function_begin_B("_vpaes_schedule_transform");
    642 	&movdqa	("xmm2",&QWP($k_s0F,$const));	# 2 : s0F nibble mask
    643 	&movdqa	("xmm1","xmm2");
    644 	&pandn	("xmm1","xmm0");		# 1 = high nibbles, still <<4
    645 	&psrld	("xmm1",4);			# 1 = high nibbles
    646 	&pand	("xmm0","xmm2");		# 0 = low nibbles
    647 	&movdqa	("xmm2",&QWP(0,$base));		# 2 : low-nibble table
    648 	&pshufb	("xmm2","xmm0");		# 2 = table lookup of the low nibbles
    649 	&movdqa	("xmm0",&QWP(16,$base));	# 0 : high-nibble table
    650 	&pshufb	("xmm0","xmm1");		# 0 = table lookup of the high nibbles
    651 	&pxor	("xmm0","xmm2");		# 0 = combined result
    652 	&ret	();
    653 &function_end_B("_vpaes_schedule_transform");
    654 
    655 ##
    656 ##  .aes_schedule_mangle
    657 ##
    658 ##  Mangle xmm0 from (basis-transformed) standard version
    659 ##  to our version.
    660 ##
    661 ##  On encrypt,
    662 ##    xor with 0x63
    663 ##    multiply by circulant 0,1,1,1
    664 ##    apply shiftrows transform
    665 ##
    666 ##  On decrypt,
    667 ##    xor with 0x63
    668 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
    669 ##    deskew
    670 ##    apply shiftrows transform
    671 ##
    672 ##
    673 ##  Writes out to (%edx), and increments or decrements it
    674 ##  Keeps track of round number mod 4 in %ecx
    675 ##  Preserves xmm0
    676 ##  Clobbers xmm1-xmm5, and %esi on the decrypt path
    677 ##
    678 &function_begin_B("_vpaes_schedule_mangle");
    679 	&movdqa	("xmm4","xmm0");	# save xmm0 for later
    680 	&movdqa	("xmm5",&QWP($k_mc_forward,$const));	# 5 = first mc_forward row
    681 	&test	($out,$out);		# $out != 0 means a decryption schedule is being built
    682 	&jnz	(&label("schedule_mangle_dec"));
    683 
    684 	# encrypting
    685 	&add	($key,16);		# advance the output pointer first, the store happens at the end
    686 	&pxor	("xmm4",&QWP($k_s63,$const));
    687 	&pshufb	("xmm4","xmm5");
    688 	&movdqa	("xmm3","xmm4");
    689 	&pshufb	("xmm4","xmm5");
    690 	&pxor	("xmm3","xmm4");
    691 	&pshufb	("xmm4","xmm5");
    692 	&pxor	("xmm3","xmm4");	# 3 = xor of the three successive mc_forward permutations
    693 
    694 	&jmp	(&label("schedule_mangle_both"));
    695 
    696 &set_label("schedule_mangle_dec",16);
    697 	# inverse mix columns
    698 	&movdqa	("xmm2",&QWP($k_s0F,$const));
    699 	&lea	($inp,&DWP($k_dksd,$const));	# $inp is reused as the pointer to the dksd/dksb/dkse/dks9 tables
    700 	&movdqa	("xmm1","xmm2");
    701 	&pandn	("xmm1","xmm4");
    702 	&psrld	("xmm1",4);			# 1 = hi
    703 	&pand	("xmm4","xmm2");		# 4 = lo
    704 
    705 	&movdqa	("xmm2",&QWP(0,$inp));		# dksd lo
    706 	&pshufb	("xmm2","xmm4");
    707 	&movdqa	("xmm3",&QWP(0x10,$inp));	# dksd hi
    708 	&pshufb	("xmm3","xmm1");
    709 	&pxor	("xmm3","xmm2");
    710 	&pshufb	("xmm3","xmm5");
    711 
    712 	&movdqa	("xmm2",&QWP(0x20,$inp));	# dksb lo
    713 	&pshufb	("xmm2","xmm4");
    714 	&pxor	("xmm2","xmm3");
    715 	&movdqa	("xmm3",&QWP(0x30,$inp));	# dksb hi
    716 	&pshufb	("xmm3","xmm1");
    717 	&pxor	("xmm3","xmm2");
    718 	&pshufb	("xmm3","xmm5");
    719 
    720 	&movdqa	("xmm2",&QWP(0x40,$inp));	# dkse lo
    721 	&pshufb	("xmm2","xmm4");
    722 	&pxor	("xmm2","xmm3");
    723 	&movdqa	("xmm3",&QWP(0x50,$inp));	# dkse hi
    724 	&pshufb	("xmm3","xmm1");
    725 	&pxor	("xmm3","xmm2");
    726 	&pshufb	("xmm3","xmm5");
    727 
    728 	&movdqa	("xmm2",&QWP(0x60,$inp));	# dks9 lo
    729 	&pshufb	("xmm2","xmm4");
    730 	&pxor	("xmm2","xmm3");
    731 	&movdqa	("xmm3",&QWP(0x70,$inp));	# dks9 hi
    732 	&pshufb	("xmm3","xmm1");
    733 	&pxor	("xmm3","xmm2");
    734 
    735 	&add	($key,-16);			# decryption keys are written downwards
    736 
    737 &set_label("schedule_mangle_both");
    738 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    739 	&pshufb	("xmm3","xmm1");		# apply the sr row selected by $magic
    740 	&add	($magic,-16);
    741 	&and	($magic,0x30);			# step the sr row index, mod 4 rows
    742 	&movdqu	(&QWP(0,$key),"xmm3");		# store the mangled round key (unaligned)
    743 	&ret	();
    744 &function_end_B("_vpaes_schedule_mangle");
    745 
    746 #
    747 # Interface to OpenSSL
    748 #
    749 &function_begin("${PREFIX}_set_encrypt_key");	# arguments: inp (user key), bits, key (schedule to fill); returns 0 in eax
    750 	&mov	($inp,&wparam(0));		# inp
    751 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    752 	&mov	($round,&wparam(1));		# bits
    753 	&and	($base,-16);			# ... rounded down to a 16-byte boundary for movdqa
    754 	&mov	($key,&wparam(2));		# key
    755 	&xchg	($base,"esp");			# alloca
    756 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp at offset 48 of the new frame
    757 
    758 	&mov	($base,$round);
    759 	&shr	($base,5);
    760 	&add	($base,5);
    761 	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
    762 	&mov	($magic,0x30);			# initial k_sr row offset for the schedule
    763 	&mov	($out,0);			# 0 = build an encryption schedule
    764 
    765 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# position-independent offset; the callee adds its return address (= pic_point)
    766 	&call	("_vpaes_schedule_core");
    767 &set_label("pic_point");
    768 
    769 	&mov	("esp",&DWP(48,"esp"));		# release the aligned frame
    770 	&xor	("eax","eax");			# return 0
    771 &function_end("${PREFIX}_set_encrypt_key");
    772 
    773 &function_begin("${PREFIX}_set_decrypt_key");	# arguments: inp (user key), bits, key (schedule to fill); returns 0 in eax
    774 	&mov	($inp,&wparam(0));		# inp
    775 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    776 	&mov	($round,&wparam(1));		# bits
    777 	&and	($base,-16);			# ... rounded down to a 16-byte boundary for movdqa
    778 	&mov	($key,&wparam(2));		# key
    779 	&xchg	($base,"esp");			# alloca
    780 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp at offset 48 of the new frame
    781 
    782 	&mov	($base,$round);
    783 	&shr	($base,5);
    784 	&add	($base,5);
    785 	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
    786 	&shl	($base,4);
    787 	&lea	($key,&DWP(16,$key,$base));	# $key = key + 16*rounds + 16: the schedule is written from its end downwards
    788 
    789 	&mov	($out,1);			# non-zero = build a decryption schedule
    790 	&mov	($magic,$round);
    791 	&shr	($magic,1);
    792 	&and	($magic,32);
    793 	&xor	($magic,32);			# nbits==192?0:32;
    794 
    795 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# position-independent offset; the callee adds its return address (= pic_point)
    796 	&call	("_vpaes_schedule_core");
    797 &set_label("pic_point");
    798 
    799 	&mov	("esp",&DWP(48,"esp"));		# release the aligned frame
    800 	&xor	("eax","eax");			# return 0
    801 &function_end("${PREFIX}_set_decrypt_key");
    802 
    803 &function_begin("${PREFIX}_encrypt");	# arguments: inp, out, key; encrypts the 16 bytes at inp into out
    804 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# position-independent offset, completed by _vpaes_preheat
    805 	&call	("_vpaes_preheat");		# also loads xmm6/xmm7
    806 &set_label("pic_point");
    807 	&mov	($inp,&wparam(0));		# inp
    808 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    809 	&mov	($out,&wparam(1));		# out
    810 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    811 	&mov	($key,&wparam(2));		# key
    812 	&xchg	($base,"esp");			# alloca
    813 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp at offset 48 of the new frame
    814 
    815 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block (unaligned)
    816 	&call	("_vpaes_encrypt_core");
    817 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    818 
    819 	&mov	("esp",&DWP(48,"esp"));		# release the aligned frame
    820 &function_end("${PREFIX}_encrypt");
    821 
    822 &function_begin("${PREFIX}_decrypt");	# arguments: inp, out, key; decrypts the 16 bytes at inp into out
    823 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# position-independent offset, completed by _vpaes_preheat
    824 	&call	("_vpaes_preheat");		# also loads xmm6/xmm7
    825 &set_label("pic_point");
    826 	&mov	($inp,&wparam(0));		# inp
    827 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    828 	&mov	($out,&wparam(1));		# out
    829 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    830 	&mov	($key,&wparam(2));		# key
    831 	&xchg	($base,"esp");			# alloca
    832 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp at offset 48 of the new frame
    833 
    834 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block (unaligned)
    835 	&call	("_vpaes_decrypt_core");
    836 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    837 
    838 	&mov	("esp",&DWP(48,"esp"));		# release the aligned frame
    839 &function_end("${PREFIX}_decrypt");
    840 
    841 &function_begin("${PREFIX}_cbc_encrypt");	# arguments: inp, out, len, key, ivp, enc (0 = decrypt); whole 16-byte blocks only, len < 16 does nothing
    842 	&mov	($inp,&wparam(0));		# inp
    843 	&mov	($out,&wparam(1));		# out
    844 	&mov	($round,&wparam(2));		# len
    845 	&mov	($key,&wparam(3));		# key
    846 	&sub	($round,16);
    847 	&jc	(&label("cbc_abort"));		# fewer than 16 bytes: nothing to do
    848 	&lea	($base,&DWP(-56,"esp"));	# frame: 0 out-inp, 4 key, 8 ivp, 16 IV, 32 next IV, 48 saved esp
    849 	&mov	($const,&wparam(4));		# ivp
    850 	&and	($base,-16);			# 16-byte alignment for the movdqa slots
    851 	&mov	($magic,&wparam(5));		# enc
    852 	&xchg	($base,"esp");			# alloca
    853 	&movdqu	("xmm1",&QWP(0,$const));	# load IV
    854 	&sub	($out,$inp);			# keep out as a distance from inp so one pointer walks both buffers
    855 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp
    856 
    857 	&mov	(&DWP(0,"esp"),$out);		# save out
    858 	&mov	(&DWP(4,"esp"),$key);		# save key
    859 	&mov	(&DWP(8,"esp"),$const);		# save ivp
    860 	&mov	($out,$round);			# $out works as $len
    861 
    862 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# position-independent offset, completed by _vpaes_preheat
    863 	&call	("_vpaes_preheat");
    864 &set_label("pic_point");
    865 	&cmp	($magic,0);
    866 	&je	(&label("cbc_dec_loop"));	# enc == 0 selects decryption
    867 	&jmp	(&label("cbc_enc_loop"));
    868 
    869 &set_label("cbc_enc_loop",16);
    870 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    871 	&pxor	("xmm0","xmm1");		# inp^=iv
    872 	&call	("_vpaes_encrypt_core");	# clobbers $base and $key, hence the reloads below
    873 	&mov	($base,&DWP(0,"esp"));		# restore out
    874 	&mov	($key,&DWP(4,"esp"));		# restore key
    875 	&movdqa	("xmm1","xmm0");		# the ciphertext is the next IV
    876 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    877 	&lea	($inp,&DWP(16,$inp));		# advance without touching the flags
    878 	&sub	($out,16);
    879 	&jnc	(&label("cbc_enc_loop"));	# continue while at least one more full block remains
    880 	&jmp	(&label("cbc_done"));
    881 
    882 &set_label("cbc_dec_loop",16);
    883 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    884 	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
    885 	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
    886 	&call	("_vpaes_decrypt_core");	# clobbers xmm1, $base and $key, hence the saves and reloads
    887 	&mov	($base,&DWP(0,"esp"));		# restore out
    888 	&mov	($key,&DWP(4,"esp"));		# restore key
    889 	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
    890 	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
    891 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    892 	&lea	($inp,&DWP(16,$inp));		# advance without touching the flags
    893 	&sub	($out,16);
    894 	&jnc	(&label("cbc_dec_loop"));	# continue while at least one more full block remains
    895 
    896 &set_label("cbc_done");
    897 	&mov	($base,&DWP(8,"esp"));		# restore ivp
    898 	&mov	("esp",&DWP(48,"esp"));		# release the aligned frame
    899 	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
    900 &set_label("cbc_abort");
    901 &function_end("${PREFIX}_cbc_encrypt");
    902 
    903 &asm_finish();	# final perlasm call of the script; presumably flushes the generated assembly (see x86asm.pl)
    904