Home | History | Annotate | Download | only in asm
      1 #! /usr/bin/env perl
      2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 ######################################################################
     11 ## Constant-time SSSE3 AES core implementation.
     12 ## version 0.1
     13 ##
     14 ## By Mike Hamburg (Stanford University), 2009
     15 ## Public domain.
     16 ##
     17 ## For details see http://shiftleft.org/papers/vector_aes/ and
     18 ## http://crypto.stanford.edu/vpaes/.
     19 
     20 ######################################################################
     21 # September 2011.
     22 #
     23 # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
     24 # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
     25 # doesn't handle partial vectors (doesn't have to if called from
     26 # EVP only). "Drop-in" implies that this module doesn't share key
     27 # schedule structure with the original nor does it make assumption
     28 # about its alignment...
     29 #
     30 # Performance summary. aes-586.pl column lists large-block CBC
     31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
     32 # byte processed with 128-bit key, and vpaes-x86.pl column - [also
     33 # large-block CBC] encrypt/decrypt.
     34 #
     35 #		aes-586.pl		vpaes-x86.pl
     36 #
     37 # Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
     38 # Nehalem	27.9/40.4/18.1		10.2/11.9
     39 # Atom		70.7/92.1/60.1		61.1/75.4(***)
     40 # Silvermont	45.4/62.9/24.1		49.2/61.1(***)
     41 #
     42 # (*)	"Hyper-threading" in the context refers rather to cache shared
     43 #	among multiple cores, than to specifically Intel HTT. As vast
     44 #	majority of contemporary cores share cache, slower code path
     45 #	is common place. In other words "with-hyper-threading-off"
     46 #	results are presented mostly for reference purposes.
     47 #
     48 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
     49 #
     50 # (***)	Less impressive improvement on Core 2 and Atom is due to slow
     51 #	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
     52 #	and +15% on Atom (as implied, over "hyper-threading-safe"
     53 #	code path).
     54 #
     55 #						<appro@openssl.org>
     56 
     57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
     58 push(@INC,"${dir}","${dir}../../../perlasm");
     59 require "x86asm.pl";
     60 
     61 $output = pop;				# last command-line argument names the output file
        # Abort if the output file cannot be created; otherwise STDOUT would be
        # aliased to an unopened handle and every emitted line silently dropped.
     62 open OUT,">$output" or die "can't open $output: $!";
     63 *STDOUT=*OUT;				# everything printed from here on goes to $output
     64 
     65 &asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
     66 
     67 $PREFIX="vpaes";			# prefix of the exported function names
     68 
        # Symbolic names for the general-purpose registers used by every routine below.
     69 my  ($round, $base, $magic, $key, $const, $inp, $out)=
     70     ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
     71 
        # Each call is terminated explicitly; without the ';' the first two lines
        # were parsed as a single expression joined by the binary '&' operator.
     72 &preprocessor_ifndef("NDEBUG");
     73 &external_label("BORINGSSL_function_hit");
     74 &preprocessor_endif();
     75 &static_label("_vpaes_consts");
     76 &static_label("_vpaes_schedule_low_round");
     77 
     78 &set_label("_vpaes_consts",64);	# constant table; every $k_* below is a byte offset from _vpaes_consts+0x30
     79 $k_inv=-0x30;		# inv, inva
     80 	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
     81 	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
     82 
     83 $k_s0F=-0x10;		# s0F
     84 	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
     85 
     86 $k_ipt=0x00;		# input transform (lo, hi)
     87 	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
     88 	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
     89 
     90 $k_sb1=0x20;		# sb1u, sb1t
     91 	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
     92 	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
     93 $k_sb2=0x40;		# sb2u, sb2t
     94 	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
     95 	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
     96 $k_sbo=0x60;		# sbou, sbot
     97 	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
     98 	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
     99 
    100 $k_mc_forward=0x80;	# mc_forward
    101 	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    102 	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    103 	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    104 	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
    105 
    106 $k_mc_backward=0xc0;	# mc_backward
    107 	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    108 	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    109 	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    110 	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
    111 
    112 $k_sr=0x100;		# sr
    113 	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    114 	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    115 	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    116 	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
    117 
    118 $k_rcon=0x140;		# rcon
    119 	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
    120 
    121 $k_s63=0x150;		# s63: all equal to 0x63 transformed
    122 	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
    123 
    124 $k_opt=0x160;		# output transform
    125 	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    126 	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
    127 
    128 $k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
    129 	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    130 	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
    131 ##
    132 ##  Decryption stuff
    133 ##  Key schedule constants
    134 ##
    135 $k_dksd=0x1a0;		# decryption key schedule: invskew x*D
    136 	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    137 	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
    138 $k_dksb=0x1c0;		# decryption key schedule: invskew x*B
    139 	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    140 	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
    141 $k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
    142 	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    143 	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
    144 $k_dks9=0x200;		# decryption key schedule: invskew x*9
    145 	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    146 	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
    147 
    148 ##
    149 ##  Decryption stuff
    150 ##  Round function constants
    151 ##
    152 $k_dipt=0x220;		# decryption input transform
    153 	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    154 	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
    155 
    156 $k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
    157 	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    158 	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
    159 $k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
    160 	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    161 	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
    162 $k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
    163 	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    164 	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
    165 $k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
    166 	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    167 	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
    168 $k_dsbo=0x2c0;		# decryption sbox final output
    169 	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    170 	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
    171 &asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
    172 &align	(64);
    173 
    174 &function_begin_B("_vpaes_preheat");	# make $const absolute and preload xmm6/xmm7 for the cores
    175 	&add	($const,&DWP(0,"esp"));		# callers load $const with consts+0x30-pic_point; adding the return address (pic_point) completes it
    176 	&movdqa	("xmm7",&QWP($k_inv,$const));	# 7 : first row of the inv table
    177 	&movdqa	("xmm6",&QWP($k_s0F,$const));	# 6 : s0F nibble mask
    178 	&ret	();
    179 &function_end_B("_vpaes_preheat");
    180 
    181 ##
    182 ##  _aes_encrypt_core
    183 ##
    184 ##  AES-encrypt %xmm0.
    185 ##
    186 ##  Inputs:
    187 ##     %xmm0 = input
    188 ##     %xmm6-%xmm7 as in _vpaes_preheat
    189 ##    (%edx) = scheduled keys
    190 ##
    191 ##  Output in %xmm0
    192 ##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
    193 ##
    194 ##
    195 &function_begin_B("_vpaes_encrypt_core");
        	# Input transform and first round-key addition, then jump into the
        	# shared "top of round" code at enc_entry.
    196 	&mov	($magic,16);			# start on the second row of mc_forward/mc_backward
    197 	&mov	($round,&DWP(240,$key));	# round count stored at offset 240 of the schedule
    198 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask (statement is now terminated; it used to be fused to the next call by a binary '&')
    199 	&movdqa	("xmm2",&QWP($k_ipt,$const));	# 2 : ipt lo table
    200 	&pandn	("xmm1","xmm0");		# 1 = high nibbles (still shifted)
    201 	&pand	("xmm0","xmm6");		# 0 = low nibbles
    202 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (unaligned load)
    203 	&pshufb	("xmm2","xmm0");		# 2 = ipt lo lookup
    204 	&movdqa	("xmm0",&QWP($k_ipt+16,$const));# 0 : ipt hi table
    205 	&pxor	("xmm2","xmm5");		# 2 ^= round key 0
    206 	&psrld	("xmm1",4);			# 1 = high nibbles
    207 	&add	($key,16);			# next round key; the flags of this add are what the first jnz at enc_entry tests
    208 	&pshufb	("xmm0","xmm1");		# 0 = ipt hi lookup
    209 	&lea	($base,&DWP($k_mc_backward,$const));	# base = &mc_backward; -0x40 reaches mc_forward, +0x40 reaches sr
    210 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    211 	&jmp	(&label("enc_entry"));
    212 
    213 
    214 &set_label("enc_loop",16);
    215 	# middle of middle round
    216 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    217 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    218 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    219 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    220 	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
    221 	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
    222 	&pxor	("xmm0","xmm4");		# 0 = A
    223 	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
    224 	&pshufb	("xmm5","xmm2");		# 5 = sb2u
    225 	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
    226 	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
    227 	&pshufb	("xmm2","xmm3");		# 2 = sb2t
    228 	&movdqa	("xmm3","xmm0");		# 3 = A
    229 	&pxor	("xmm2","xmm5");		# 2 = 2A
    230 	&pshufb	("xmm0","xmm1");		# 0 = B
    231 	&add	($key,16);			# next key
    232 	&pxor	("xmm0","xmm2");		# 0 = 2A+B
    233 	&pshufb	("xmm3","xmm4");		# 3 = D
    234 	&add	($magic,16);			# next mc
    235 	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
    236 	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
    237 	&and	($magic,0x30);			# ... mod 4
    238 	&sub	($round,1);			# nr--
    239 	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
    240 
    241 &set_label("enc_entry");
    242 	# top of round
    243 	&movdqa	("xmm1","xmm6");		# 1 : i
    244 	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
    245 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    246 	&psrld	("xmm1",4);			# 1 = i
    247 	&pand	("xmm0","xmm6");		# 0 = k
    248 	&pshufb	("xmm5","xmm0");		# 5 = a/k
    249 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    250 	&pxor	("xmm0","xmm1");		# 0 = j
    251 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    252 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    253 	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
    254 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    255 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    256 	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
    257 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    258 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    259 	&pxor	("xmm2","xmm0");		# 2 = io
    260 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    261 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = current round key
    262 	&pxor	("xmm3","xmm1");		# 3 = jo
    263 	&jnz	(&label("enc_loop"));		# ZF is from sub($round,1) above (add($key,16) on first entry); the SSE ops in between leave EFLAGS alone
    264 
    265 	# middle of last round
    266 	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
    267 	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
    268 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    269 	&pxor	("xmm4","xmm5");		# 4 = sbou + k
    270 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    271 	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
    272 	&pxor	("xmm0","xmm4");		# 0 = A
    273 	&pshufb	("xmm0","xmm1");		# 0 = A permuted by the sr row: final output
    274 	&ret	();
    275 &function_end_B("_vpaes_encrypt_core");
    276 
    277 ##
    278 ##  Decryption core
    279 ##
    280 ##  Same API as encryption core.
    281 ##
    282 &function_begin_B("_vpaes_decrypt_core");
        	# Input transform and first round-key addition, then jump into the
        	# shared "top of round" code at dec_entry.
    283 	&lea	($base,&DWP($k_dsbd,$const));	# base = &dsbd; the other decryption tables are addressed relative to it
    284 	&mov	($round,&DWP(240,$key));	# round count stored at offset 240 of the schedule
    285 	&movdqa	("xmm1","xmm6");		# 1 : s0F mask
    286 	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));	# 2 : dipt lo table
    287 	&pandn	("xmm1","xmm0");		# 1 = high nibbles (still shifted)
    288 	&mov	($magic,$round);
    289 	&psrld	("xmm1",4);			# 1 = high nibbles (statement is now terminated; it used to be fused to the next call by a binary '&')
    290 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0 (unaligned load)
    291 	&shl	($magic,4);			# magic = rounds*16 ...
    292 	&pand	("xmm0","xmm6");		# 0 = low nibbles
    293 	&pshufb	("xmm2","xmm0");		# 2 = dipt lo lookup
    294 	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));	# 0 : dipt hi table
    295 	&xor	($magic,0x30);			# ... flipped and reduced to a 16-byte row offset (0x00..0x30)
    296 	&pshufb	("xmm0","xmm1");		# 0 = dipt hi lookup
    297 	&and	($magic,0x30);
    298 	&pxor	("xmm2","xmm5");		# 2 ^= round key 0
    299 	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));	# 5 : last row of mc_forward
    300 	&pxor	("xmm0","xmm2");		# 0 = transformed input ^ round key 0
    301 	&add	($key,16);			# next round key; the flags of this add are what the first jnz at dec_entry tests
    302 	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));	# magic = address of the selected sr row
    303 	&jmp	(&label("dec_entry"));
    304 
    305 &set_label("dec_loop",16);
    306 ##
    307 ##  Inverse mix columns
    308 ##
    309 	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
    310 	&movdqa	("xmm1",&QWP(-0x10,$base));	# 1 : sb9t
    311 	&pshufb	("xmm4","xmm2");		# 4 = sb9u
    312 	&pshufb	("xmm1","xmm3");		# 1 = sb9t
    313 	&pxor	("xmm0","xmm4");		# 0 = round key + sb9u
    314 	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
    315 	&pxor	("xmm0","xmm1");		# 0 = ch
    316 	&movdqa	("xmm1",&QWP(0x10,$base));	# 1 : sbdt
    317 
    318 	&pshufb	("xmm4","xmm2");		# 4 = sbdu
    319 	&pshufb	("xmm0","xmm5");		# MC ch
    320 	&pshufb	("xmm1","xmm3");		# 1 = sbdt
    321 	&pxor	("xmm0","xmm4");		# 0 = ch
    322 	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
    323 	&pxor	("xmm0","xmm1");		# 0 = ch
    324 	&movdqa	("xmm1",&QWP(0x30,$base));	# 1 : sbbt
    325 
    326 	&pshufb	("xmm4","xmm2");		# 4 = sbbu
    327 	&pshufb	("xmm0","xmm5");		# MC ch
    328 	&pshufb	("xmm1","xmm3");		# 1 = sbbt
    329 	&pxor	("xmm0","xmm4");		# 0 = ch
    330 	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
    331 	&pxor	("xmm0","xmm1");		# 0 = ch
    332 	&movdqa	("xmm1",&QWP(0x50,$base));	# 1 : sbet
    333 
    334 	&pshufb	("xmm4","xmm2");		# 4 = sbeu
    335 	&pshufb	("xmm0","xmm5");		# MC ch
    336 	&pshufb	("xmm1","xmm3");		# 1 = sbet
    337 	&pxor	("xmm0","xmm4");		# 0 = ch
    338 	&add	($key,16);			# next round key
    339 	&palignr("xmm5","xmm5",12);		# rotate the mc permutation for the next round
    340 	&pxor	("xmm0","xmm1");		# 0 = ch
    341 	&sub	($round,1);			# nr--
    342 
    343 &set_label("dec_entry");
    344 	# top of round
    345 	&movdqa	("xmm1","xmm6");		# 1 : i
    346 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    347 	&pandn	("xmm1","xmm0");		# 1 = i<<4
    348 	&pand	("xmm0","xmm6");		# 0 = k
    349 	&psrld	("xmm1",4);			# 1 = i
    350 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    351 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
    352 	&pxor	("xmm0","xmm1");		# 0 = j
    353 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    354 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
    355 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    356 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    357 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    358 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
    359 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    360 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
    361 	&pxor	("xmm2","xmm0");		# 2 = io
    362 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    363 	&movdqu	("xmm0",&QWP(0,$key));		# 0 = current round key
    364 	&pxor	("xmm3","xmm1");		# 3 = jo
    365 	&jnz	(&label("dec_loop"));		# ZF is from sub($round,1) above (add($key,16) on first entry); the SSE ops in between leave EFLAGS alone
    366 
    367 	# middle of last round
    368 	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
    369 	&pshufb	("xmm4","xmm2");		# 4 = sbou
    370 	&pxor	("xmm4","xmm0");		# 4 = sbou + k
    371 	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
    372 	&movdqa	("xmm2",&QWP(0,$magic));	# 2 : sr row selected in the prologue
    373 	&pshufb	("xmm0","xmm3");		# 0 = sbot
    374 	&pxor	("xmm0","xmm4");		# 0 = A
    375 	&pshufb	("xmm0","xmm2");		# 0 = A permuted by the sr row: final output
    376 	&ret	();
    377 &function_end_B("_vpaes_decrypt_core");
    378 
    379 ########################################################
    380 ##                                                    ##
    381 ##                  AES key schedule                  ##
    382 ##                                                    ##
    383 ########################################################
    384 &function_begin_B("_vpaes_schedule_core");
        	# Register contract, as set up by the set_*_key wrappers in this file:
        	#   $inp   = user key bytes, $round = key size in bits,
        	#   $key   = where round keys are written (end of the schedule when decrypting),
        	#   $out   = 0 when encrypting, non-zero when decrypting,
        	#   $magic = byte offset of the current row in the $k_sr table,
        	#   $const = consts+0x30-pic_point (made absolute by the add below).
    385 	&add	($const,&DWP(0,"esp"));		# add the return address (pic_point) to get the absolute table address
    386 	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
    387 	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
    388 
    389 	# input transform
    390 	&movdqa	("xmm3","xmm0");		# 3 = untransformed key, used only on the decrypt path below
    391 	&lea	($base,&DWP($k_ipt,$const));	# table pointer for _vpaes_schedule_transform
    392 	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8: rcon lives in a stack slot; _vpaes_schedule_round sees it at 8(%esp), one return address deeper
    393 	&call	("_vpaes_schedule_transform");
    394 	&movdqa	("xmm7","xmm0");		# 7 = transformed key
    395 
    396 	&test	($out,$out);
    397 	&jnz	(&label("schedule_am_decrypting"));
    398 
    399 	# encrypting, output zeroth round key after transform
    400 	&movdqu	(&QWP(0,$key),"xmm0");
    401 	&jmp	(&label("schedule_go"));
    402 
    403 &set_label("schedule_am_decrypting");
    404 	# decrypting, output zeroth round key after shiftrows
    405 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    406 	&pshufb	("xmm3","xmm1");
    407 	&movdqu	(&QWP(0,$key),"xmm3");
    408 	&xor	($magic,0x30);
    409 
    410 &set_label("schedule_go");
    411 	&cmp	($round,192);			# dispatch on key size in bits
    412 	&ja	(&label("schedule_256"));
    413 	&je	(&label("schedule_192"));
    414 	# 128: fall through
    415 
    416 ##
    417 ##  .schedule_128
    418 ##
    419 ##  128-bit specific part of key schedule.
    420 ##
    421 ##  This schedule is really simple, because all its parts
    422 ##  are accomplished by the subroutines.
    423 ##
    424 &set_label("schedule_128");
    425 	&mov	($round,10);			# $round is reused as the loop counter from here on
    426 
    427 &set_label("loop_schedule_128");
    428 	&call	("_vpaes_schedule_round");
    429 	&dec	($round);
    430 	&jz	(&label("schedule_mangle_last"));
    431 	&call	("_vpaes_schedule_mangle");	# write output
    432 	&jmp	(&label("loop_schedule_128"));
    433 
    434 ##
    435 ##  .aes_schedule_192
    436 ##
    437 ##  192-bit specific part of key schedule.
    438 ##
    439 ##  The main body of this schedule is the same as the 128-bit
    440 ##  schedule, but with more smearing.  The long, high side is
    441 ##  stored in %xmm7 as before, and the short, low side is in
    442 ##  the high bits of %xmm6.
    443 ##
    444 ##  This schedule is somewhat nastier, however, because each
    445 ##  round produces 192 bits of key material, or 1.5 round keys.
    446 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
    447 ##  keys.
    448 ##
    449 &set_label("schedule_192",16);
    450 	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
    451 	&call	("_vpaes_schedule_transform");	# input transform
    452 	&movdqa	("xmm6","xmm0");		# save short part
    453 	&pxor	("xmm4","xmm4");		# clear 4
    454 	&movhlps("xmm6","xmm4");		# clobber low side with zeros
    455 	&mov	($round,4);
    456 
    457 &set_label("loop_schedule_192");
    458 	&call	("_vpaes_schedule_round");
    459 	&palignr("xmm0","xmm6",8);
    460 	&call	("_vpaes_schedule_mangle");	# save key n
    461 	&call	("_vpaes_schedule_192_smear");
    462 	&call	("_vpaes_schedule_mangle");	# save key n+1
    463 	&call	("_vpaes_schedule_round");
    464 	&dec	($round);
    465 	&jz	(&label("schedule_mangle_last"));
    466 	&call	("_vpaes_schedule_mangle");	# save key n+2
    467 	&call	("_vpaes_schedule_192_smear");
    468 	&jmp	(&label("loop_schedule_192"));
    469 
    470 ##
    471 ##  .aes_schedule_256
    472 ##
    473 ##  256-bit specific part of key schedule.
    474 ##
    475 ##  The structure here is very similar to the 128-bit
    476 ##  schedule, but with an additional "low side" in
    477 ##  %xmm6.  The low side's rounds are the same as the
    478 ##  high side's, except no rcon and no rotation.
    479 ##
    480 &set_label("schedule_256",16);
    481 	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
    482 	&call	("_vpaes_schedule_transform");	# input transform
    483 	&mov	($round,7);
    484 
    485 &set_label("loop_schedule_256");
    486 	&call	("_vpaes_schedule_mangle");	# output low result
    487 	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
    488 
    489 	# high round
    490 	&call	("_vpaes_schedule_round");
    491 	&dec	($round);
    492 	&jz	(&label("schedule_mangle_last"));
    493 	&call	("_vpaes_schedule_mangle");
    494 
    495 	# low round. swap xmm7 and xmm6
    496 	&pshufd	("xmm0","xmm0",0xFF);		# broadcast the highest dword of xmm0
    497 	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in a second stack slot, next to the rcon slot
    498 	&movdqa	("xmm7","xmm6");
    499 	&call	("_vpaes_schedule_low_round");
    500 	&movdqa	("xmm7",&QWP(20,"esp"));	# restore xmm7
    501 
    502 	&jmp	(&label("loop_schedule_256"));
    503 
    504 ##
    505 ##  .aes_schedule_mangle_last
    506 ##
    507 ##  Mangler for last round of key schedule
    508 ##  Mangles %xmm0
    509 ##    when encrypting, outputs out(%xmm0) ^ 63
    510 ##    when decrypting, outputs unskew(%xmm0)
    511 ##
    512 ##  Always called right before return... jumps to cleanup and exits
    513 ##
    514 &set_label("schedule_mangle_last",16);
    515 	# schedule last round key from xmm0
    516 	&lea	($base,&DWP($k_deskew,$const));	# decrypt default: deskew tables
    517 	&test	($out,$out);
    518 	&jnz	(&label("schedule_mangle_last_dec"));
    519 
    520 	# encrypting
    521 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
    522 	&pshufb	("xmm0","xmm1");		# output permute
    523 	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
    524 	&add	($key,32);			# net +16 once the shared -16 below is applied
    525 
    526 &set_label("schedule_mangle_last_dec");
    527 	&add	($key,-16);
    528 	&pxor	("xmm0",&QWP($k_s63,$const));
    529 	&call	("_vpaes_schedule_transform");	# output transform
    530 	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
    531 
    532 	# cleanup
    533 	&pxor	("xmm0","xmm0");		# zero every xmm register before returning
    534 	&pxor	("xmm1","xmm1");
    535 	&pxor	("xmm2","xmm2");
    536 	&pxor	("xmm3","xmm3");
    537 	&pxor	("xmm4","xmm4");
    538 	&pxor	("xmm5","xmm5");
    539 	&pxor	("xmm6","xmm6");
    540 	&pxor	("xmm7","xmm7");
    541 	&ret	();
    542 &function_end_B("_vpaes_schedule_core");
    543 
    544 ##
    545 ##  .aes_schedule_192_smear
    546 ##
    547 ##  Smear the short, low side in the 192-bit key schedule.
    548 ##
    549 ##  Inputs:
    550 ##    %xmm7: high side, b  a  x  y
    551 ##    %xmm6:  low side, d  c  0  0
    552 ##    (no zero register is passed in: %xmm1 is cleared inside the routine)
    553 ##
    554 ##  Outputs:
    555 ##    %xmm6: b+c+d  b+c  0  0
    556 ##    %xmm0: b+c+d  b+c  b  a
    557 ##
    558 &function_begin_B("_vpaes_schedule_192_smear");	# inputs xmm6/xmm7, results in xmm6/xmm0; uses xmm1 as scratch
    559 	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
    560 	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
    561 	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
    562 	&pxor	("xmm1","xmm1");		# 1 = 0, the zero source for movhlps below
    563 	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
    564 	&movdqa	("xmm0","xmm6");		# 0 = full result
    565 	&movhlps("xmm6","xmm1");		# clobber low side with zeros
    566 	&ret	();
    567 &function_end_B("_vpaes_schedule_192_smear");
    568 
    569 ##
    570 ##  .aes_schedule_round
    571 ##
    572 ##  Runs one main round of the key schedule on %xmm0, %xmm7
    573 ##
    574 ##  Specifically, runs subbytes on the high dword of %xmm0
    575 ##  then rotates it by one byte and xors into the low dword of
    576 ##  %xmm7.
    577 ##
    578 ##  Adds rcon from the "xmm8" stack slot at 8(%esp), then rotates
    579 ##  that slot's value for the next rcon.
    580 ##
    581 ##  Smears the dwords of %xmm7 by xoring the low into the
    582 ##  second low, result into third, result into highest.
    583 ##
    584 ##  Returns results in %xmm7 = %xmm0.
    585 ##  Clobbers %xmm1-%xmm5.
    586 ##
    587 &function_begin_B("_vpaes_schedule_round");
    588 	# extract rcon from xmm8
    589 	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8 (stack slot written by _vpaes_schedule_core)
    590 	&pxor	("xmm1","xmm1");
    591 	&palignr("xmm1","xmm2",15);		# 1 = one rcon byte, zero-extended
    592 	&palignr("xmm2","xmm2",15);		# rotate rcon for the next round
    593 	&pxor	("xmm7","xmm1");		# add rcon into xmm7
    594 
    595 	# rotate
    596 	&pshufd	("xmm0","xmm0",0xFF);
    597 	&palignr("xmm0","xmm0",1);
    598 
    599 	# fall through...
    600 	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8 (store the rotated rcon back)
    601 
    602 	# low round: same as high round, but no rotation and no rcon.
        	# This label is also called directly by the 256-bit path of _vpaes_schedule_core.
    603 &set_label("_vpaes_schedule_low_round");
    604 	# smear xmm7
    605 	&movdqa	("xmm1","xmm7");
    606 	&pslldq	("xmm7",4);
    607 	&pxor	("xmm7","xmm1");
    608 	&movdqa	("xmm1","xmm7");
    609 	&pslldq	("xmm7",8);
    610 	&pxor	("xmm7","xmm1");
    611 	&pxor	("xmm7",&QWP($k_s63,$const));
    612 
    613 	# subbyte
    614 	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F mask (reloaded from memory rather than taken from xmm6)
    615 	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv table (first row of $k_inv)
    616 	&movdqa	("xmm1","xmm4");
    617 	&pandn	("xmm1","xmm0");
    618 	&psrld	("xmm1",4);			# 1 = i
    619 	&pand	("xmm0","xmm4");		# 0 = k
    620 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    621 	&pshufb	("xmm2","xmm0");		# 2 = a/k
    622 	&pxor	("xmm0","xmm1");		# 0 = j
    623 	&movdqa	("xmm3","xmm5");		# 3 : 1/i
    624 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
    625 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
    626 	&movdqa	("xmm4","xmm5");		# 4 : 1/j
    627 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
    628 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
    629 	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
    630 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
    631 	&pxor	("xmm2","xmm0");		# 2 = io
    632 	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
    633 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
    634 	&pxor	("xmm3","xmm1");		# 3 = jo
    635 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
    636 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
    637 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    638 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
    639 	&pxor	("xmm0","xmm4");		# 0 = sbox output
    640 
    641 	# add in smeared stuff
    642 	&pxor	("xmm0","xmm7");
    643 	&movdqa	("xmm7","xmm0");
    644 	&ret	();
    645 &function_end_B("_vpaes_schedule_round");
    646 
    647 ##
    648 ##  .aes_schedule_transform
    649 ##
    650 ##  Linear-transform %xmm0 according to tables at (%ebx)
    651 ##
    652 ##  Output in %xmm0
    653 ##  Clobbers %xmm1, %xmm2
    654 ##
    655 &function_begin_B("_vpaes_schedule_transform");	# two 16-byte tables at ($base): low-nibble table first, high-nibble table at +16
    656 	&movdqa	("xmm2",&QWP($k_s0F,$const));	# 2 : s0F mask
    657 	&movdqa	("xmm1","xmm2");
    658 	&pandn	("xmm1","xmm0");
    659 	&psrld	("xmm1",4);			# 1 = high nibbles
    660 	&pand	("xmm0","xmm2");		# 0 = low nibbles
    661 	&movdqa	("xmm2",&QWP(0,$base));		# 2 : table for low nibbles
    662 	&pshufb	("xmm2","xmm0");
    663 	&movdqa	("xmm0",&QWP(16,$base));	# 0 : table for high nibbles
    664 	&pshufb	("xmm0","xmm1");
    665 	&pxor	("xmm0","xmm2");		# 0 = lo lookup ^ hi lookup
    666 	&ret	();
    667 &function_end_B("_vpaes_schedule_transform");
    668 
    669 ##
    670 ##  .aes_schedule_mangle
    671 ##
    672 ##  Mangle xmm0 from (basis-transformed) standard version
    673 ##  to our version.
    674 ##
    675 ##  On encrypt,
    676 ##    xor with 0x63
    677 ##    multiply by circulant 0,1,1,1
    678 ##    apply shiftrows transform
    679 ##
    680 ##  On decrypt,
    681 ##    xor with 0x63
    682 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
    683 ##    deskew
    684 ##    apply shiftrows transform
    685 ##
    686 ##
    687 ##  Writes out to (%edx), and increments or decrements it
    688 ##  Keeps track of round number mod 4 in %ecx
    689 ##  Preserves xmm0
    690 ##  Clobbers xmm1-xmm5
    691 ##
    692 &function_begin_B("_vpaes_schedule_mangle");
    693 	&movdqa	("xmm4","xmm0");	# save xmm0 for later
    694 	&movdqa	("xmm5",&QWP($k_mc_forward,$const));	# 5 : first row of mc_forward
    695 	&test	($out,$out);		# $out is the decrypt flag set by the set_*_key wrappers
    696 	&jnz	(&label("schedule_mangle_dec"));
    697 
    698 	# encrypting
    699 	&add	($key,16);		# the schedule is written forwards when encrypting
    700 	&pxor	("xmm4",&QWP($k_s63,$const));
    701 	&pshufb	("xmm4","xmm5");
    702 	&movdqa	("xmm3","xmm4");
    703 	&pshufb	("xmm4","xmm5");
    704 	&pxor	("xmm3","xmm4");
    705 	&pshufb	("xmm4","xmm5");
    706 	&pxor	("xmm3","xmm4");	# 3 = xor of the three rotations of xmm4
    707 
    708 	&jmp	(&label("schedule_mangle_both"));
    709 
    710 &set_label("schedule_mangle_dec",16);
    711 	# inverse mix columns
    712 	&movdqa	("xmm2",&QWP($k_s0F,$const));
    713 	&lea	($inp,&DWP($k_dksd,$const));	# NOTE: $inp is clobbered here and used as the dks* table pointer
    714 	&movdqa	("xmm1","xmm2");
    715 	&pandn	("xmm1","xmm4");
    716 	&psrld	("xmm1",4);			# 1 = hi
    717 	&pand	("xmm4","xmm2");		# 4 = lo
    718 
    719 	&movdqa	("xmm2",&QWP(0,$inp));		# dksd tables
    720 	&pshufb	("xmm2","xmm4");
    721 	&movdqa	("xmm3",&QWP(0x10,$inp));
    722 	&pshufb	("xmm3","xmm1");
    723 	&pxor	("xmm3","xmm2");
    724 	&pshufb	("xmm3","xmm5");
    725 
    726 	&movdqa	("xmm2",&QWP(0x20,$inp));	# dksb tables
    727 	&pshufb	("xmm2","xmm4");
    728 	&pxor	("xmm2","xmm3");
    729 	&movdqa	("xmm3",&QWP(0x30,$inp));
    730 	&pshufb	("xmm3","xmm1");
    731 	&pxor	("xmm3","xmm2");
    732 	&pshufb	("xmm3","xmm5");
    733 
    734 	&movdqa	("xmm2",&QWP(0x40,$inp));	# dkse tables
    735 	&pshufb	("xmm2","xmm4");
    736 	&pxor	("xmm2","xmm3");
    737 	&movdqa	("xmm3",&QWP(0x50,$inp));
    738 	&pshufb	("xmm3","xmm1");
    739 	&pxor	("xmm3","xmm2");
    740 	&pshufb	("xmm3","xmm5");
    741 
    742 	&movdqa	("xmm2",&QWP(0x60,$inp));	# dks9 tables
    743 	&pshufb	("xmm2","xmm4");
    744 	&pxor	("xmm2","xmm3");
    745 	&movdqa	("xmm3",&QWP(0x70,$inp));
    746 	&pshufb	("xmm3","xmm1");
    747 	&pxor	("xmm3","xmm2");
    748 
    749 	&add	($key,-16);			# the schedule is written backwards when decrypting
    750 
    751 &set_label("schedule_mangle_both");
    752 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));	# 1 : current sr row
    753 	&pshufb	("xmm3","xmm1");
    754 	&add	($magic,-16);			# step to the previous sr row ...
    755 	&and	($magic,0x30);			# ... wrapping within the four rows
    756 	&movdqu	(&QWP(0,$key),"xmm3");		# store the mangled round key
    757 	&ret	();
    758 &function_end_B("_vpaes_schedule_mangle");
    759 
    760 #
    761 # Interface to OpenSSL
    762 #
    763 &function_begin("${PREFIX}_set_encrypt_key");	# args: (inp, bits, key); always returns 0 in eax
    764 	record_function_hit(5);
    765 
    766 	&mov	($inp,&wparam(0));		# inp
    767 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    768 	&mov	($round,&wparam(1));		# bits
    769 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    770 	&mov	($key,&wparam(2));		# key
    771 	&xchg	($base,"esp");			# alloca
    772 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp for the epilogue
    773 
    774 	&mov	($base,$round);
    775 	&shr	($base,5);
    776 	&add	($base,5);
    777 	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
    778 	&mov	($magic,0x30);			# starting offset into the $k_sr table
    779 	&mov	($out,0);			# 0 = encrypting (tested by the schedule code)
    780 
    781 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# table offset relative to pic_point
    782 	&call	("_vpaes_schedule_core");	# its first instruction adds the return address (pic_point) to $const
    783 &set_label("pic_point");
    784 
    785 	&mov	("esp",&DWP(48,"esp"));		# undo the alloca
    786 	&xor	("eax","eax");			# return 0
    787 &function_end("${PREFIX}_set_encrypt_key");
    788 
    789 &function_begin("${PREFIX}_set_decrypt_key");	# args: (inp, bits, key); always returns 0 in eax
    790 	&mov	($inp,&wparam(0));		# inp
    791 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    792 	&mov	($round,&wparam(1));		# bits
    793 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    794 	&mov	($key,&wparam(2));		# key
    795 	&xchg	($base,"esp");			# alloca
    796 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp for the epilogue
    797 
    798 	&mov	($base,$round);
    799 	&shr	($base,5);
    800 	&add	($base,5);
    801 	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
    802 	&shl	($base,4);
    803 	&lea	($key,&DWP(16,$key,$base));	# key += 16*(rounds+1): the schedule is filled from the end backwards
    804 
    805 	&mov	($out,1);			# non-zero = decrypting (tested by the schedule code)
    806 	&mov	($magic,$round);
    807 	&shr	($magic,1);
    808 	&and	($magic,32);
    809 	&xor	($magic,32);			# nbits==192?0:32;
    810 
    811 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# table offset relative to pic_point
    812 	&call	("_vpaes_schedule_core");	# its first instruction adds the return address (pic_point) to $const
    813 &set_label("pic_point");
    814 
    815 	&mov	("esp",&DWP(48,"esp"));		# undo the alloca
    816 	&xor	("eax","eax");			# return 0
    817 &function_end("${PREFIX}_set_decrypt_key");
    818 
    819 &function_begin("${PREFIX}_encrypt");	# args: (inp, out, key); processes one 16-byte block
    820 	record_function_hit(4);
    821 
    822 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# table offset relative to pic_point
    823 	&call	("_vpaes_preheat");		# makes $const absolute and loads xmm6/xmm7
    824 &set_label("pic_point");
    825 	&mov	($inp,&wparam(0));		# inp
    826 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    827 	&mov	($out,&wparam(1));		# out
    828 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    829 	&mov	($key,&wparam(2));		# key
    830 	&xchg	($base,"esp");			# alloca
    831 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp for the epilogue
    832 
    833 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block (unaligned)
    834 	&call	("_vpaes_encrypt_core");
    835 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    836 
    837 	&mov	("esp",&DWP(48,"esp"));		# undo the alloca
    838 &function_end("${PREFIX}_encrypt");
    839 
    840 &function_begin("${PREFIX}_decrypt");	# args: (inp, out, key); processes one 16-byte block
    841 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# table offset relative to pic_point
    842 	&call	("_vpaes_preheat");		# makes $const absolute and loads xmm6/xmm7
    843 &set_label("pic_point");
    844 	&mov	($inp,&wparam(0));		# inp
    845 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    846 	&mov	($out,&wparam(1));		# out
    847 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    848 	&mov	($key,&wparam(2));		# key
    849 	&xchg	($base,"esp");			# alloca
    850 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp for the epilogue
    851 
    852 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block (unaligned)
    853 	&call	("_vpaes_decrypt_core");
    854 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result (unaligned)
    855 
    856 	&mov	("esp",&DWP(48,"esp"));		# undo the alloca
    857 &function_end("${PREFIX}_decrypt");
    858 
    859 &function_begin("${PREFIX}_cbc_encrypt");	# args: (inp, out, len, key, ivp, enc); enc==0 selects decryption
    860 	&mov	($inp,&wparam(0));		# inp
    861 	&mov	($out,&wparam(1));		# out
    862 	&mov	($round,&wparam(2));		# len
    863 	&mov	($key,&wparam(3));		# key
    864 	&sub	($round,16);
    865 	&jc	(&label("cbc_abort"));		# len < 16: nothing is processed, IV is left untouched
    866 	&lea	($base,&DWP(-56,"esp"));	# reserve 56 bytes below the current stack pointer ...
    867 	&mov	($const,&wparam(4));		# ivp
    868 	&and	($base,-16);			# ... rounded down to a 16-byte boundary
    869 	&mov	($magic,&wparam(5));		# enc
    870 	&xchg	($base,"esp");			# alloca
    871 	&movdqu	("xmm1",&QWP(0,$const));	# load IV
    872 	&sub	($out,$inp);			# keep out as a delta from inp so one pointer walks both buffers
    873 	&mov	(&DWP(48,"esp"),$base);		# remember the original esp for the epilogue
    874 
    875 	&mov	(&DWP(0,"esp"),$out);		# save out
    876 	&mov	(&DWP(4,"esp"),$key);		# save key (statement is now terminated; it used to be fused to the next call by a binary '&')
    877 	&mov	(&DWP(8,"esp"),$const);		# save ivp
    878 	&mov	($out,$round);			# $out works as $len
    879 
    880 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));	# table offset relative to pic_point
    881 	&call	("_vpaes_preheat");		# makes $const absolute and loads xmm6/xmm7
    882 &set_label("pic_point");
    883 	&cmp	($magic,0);
    884 	&je	(&label("cbc_dec_loop"));	# enc == 0: decrypt
    885 	&jmp	(&label("cbc_enc_loop"));
    886 
    887 &set_label("cbc_enc_loop",16);
    888 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    889 	&pxor	("xmm0","xmm1");		# inp^=iv
    890 	&call	("_vpaes_encrypt_core");	# advances $key and overwrites $base, hence the reloads below
    891 	&mov	($base,&DWP(0,"esp"));		# restore out
    892 	&mov	($key,&DWP(4,"esp"));		# restore key
    893 	&movdqa	("xmm1","xmm0");		# ciphertext becomes the next IV
    894 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    895 	&lea	($inp,&DWP(16,$inp));		# advance without touching flags
    896 	&sub	($out,16);			# len -= 16; borrow means no full block is left
    897 	&jnc	(&label("cbc_enc_loop"));
    898 	&jmp	(&label("cbc_done"));
    899 
    900 &set_label("cbc_dec_loop",16);
    901 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
    902 	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
    903 	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
    904 	&call	("_vpaes_decrypt_core");	# advances $key and overwrites $base, hence the reloads below
    905 	&mov	($base,&DWP(0,"esp"));		# restore out
    906 	&mov	($key,&DWP(4,"esp"));		# restore key
    907 	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
    908 	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
    909 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
    910 	&lea	($inp,&DWP(16,$inp));		# advance without touching flags
    911 	&sub	($out,16);			# len -= 16; borrow means no full block is left
    912 	&jnc	(&label("cbc_dec_loop"));
    913 
    914 &set_label("cbc_done");
    915 	&mov	($base,&DWP(8,"esp"));		# restore ivp
    916 	&mov	("esp",&DWP(48,"esp"));		# undo the alloca
    917 	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
    918 &set_label("cbc_abort");
    919 &function_end("${PREFIX}_cbc_encrypt");
    920 
    921 &asm_finish();
    922 
        # Buffered write errors (full disk, closed pipe) only surface at close time,
        # so check it rather than exit successfully with a truncated output file.
    923 close STDOUT or die "error closing STDOUT: $!";
    924