#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#   16-byte     64-byte     256-byte    1-KB        8-KB
#   53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

$PREFIX="aesni";    # if $PREFIX is set to "AES", the script
                    # generates drop-in replacement for
                    # crypto/aes/asm/aes-586.pl:-)
$inline=1;          # inline _aesni_[en|de]crypt

# Locate the perlasm helper relative to this script and load it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Both branches currently select the same instruction; the conditional
# is kept so the two configurations can diverge without touching users
# of $movekey.
if ($PREFIX eq "aesni") { $movekey=*movups; }
else                    { $movekey=*movups; }

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";     # backup copy for $rounds
$key_="ebp";        # backup copy for $key

# XMM register assignment; $in1/$in0/$ivec alias $inout3/4/5.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";     $in1="xmm5";
$inout4="xmm6";     $in0="xmm6";
$inout5="xmm7";     $ivec="xmm7";

# AESNI extension
#
# The instructions are emitted as raw opcode bytes through &data_byte.
# Only the xmm0-7,xmm0-7 register form is recognised; for any other
# operand combination these subs emit nothing.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);  }
}
sub aesimc      { aescommon(0xdb,@_); }
sub aesenc      { aescommon(0xdc,@_); }
sub aesenclast  { aescommon(0xdd,@_); }
sub aesdec      { aescommon(0xde,@_); }
sub aesdeclast  { aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# $p     - "enc" or "dec", selects aes${p}/aes${p}last
# $inout - register holding the block, defaults to $inout0
# $ivec  - optional register; when given, it is xor-ed with round key 0
#          and the result is xor-ed into $inout (instead of xor-ing
#          round key 0 into $inout directly)
#
# The round loop executes $rounds times; $key and $rounds are modified.
# $sn makes the loop label unique for every expansion.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey ($rndkey0,&QWP(0,$key));
    &$movekey ($rndkey1,&QWP(16,$key));
    &xorps ($ivec,$rndkey0) if (defined($ivec));
    &lea ($key,&DWP(32,$key));
    &xorps ($inout,$ivec) if (defined($ivec));
    &xorps ($inout,$rndkey0) if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
        eval"&aes${p} ($inout,$rndkey1)";
        &dec ($rounds);
        &$movekey ($rndkey1,&QWP(0,$key));
        &lea ($key,&DWP(16,$key));
    &jnz (&label("${p}1_loop_$sn"));
    eval"&aes${p}last ($inout,$rndkey1)";
}}

# Emits _aesni_${p}rypt1, a stand-alone single-block routine. $rounds is
# compared against 11 once to pick the entry point; "below" enters at
# ${p}128, "equal" at ${p}192, otherwise execution falls through the
# longest path.
sub aesni_generate1     # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
        &movups ($rndkey0,&QWP(0,$key));
        &$movekey ($rndkey1,&QWP(0x10,$key));
        &xorps ($inout,$rndkey0);
        &$movekey ($rndkey0,&QWP(0x20,$key));
        &lea ($key,&DWP(0x30,$key));
        &cmp ($rounds,11);
        &jb (&label("${p}128"));
        &lea ($key,&DWP(0x20,$key));    # lea leaves flags from cmp intact
        &je (&label("${p}192"));
        &lea ($key,&DWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(-0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(-0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(0,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(0x10,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(0x30,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(0x50,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey ($rndkey1,&QWP(0x60,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey ($rndkey0,&QWP(0x70,$key));
        eval"&aes${p} ($inout,$rndkey1)";
    eval"&aes${p}last ($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
    &mov ("eax",&wparam(0));
    &mov ($key,&wparam(2));
    &movups ($inout0,&QWP(0,"eax"));
    &mov ($rounds,&DWP(240,$key));
    &mov ("eax",&wparam(1));
    if ($inline)
    {   &aesni_inline_generate1("enc");    }
    else
    {   &call ("_aesni_encrypt1");         }
    &movups (&QWP(0,"eax"),$inout0);
    &ret ();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
    &mov ("eax",&wparam(0));
    &mov ($key,&wparam(2));
    &movups ($inout0,&QWP(0,"eax"));
    &mov ($rounds,&DWP(240,$key));
    &mov ("eax",&wparam(1));
    if ($inline)
    {   &aesni_inline_generate1("dec");    }
    else
    {   &call ("_aesni_decrypt1");         }
    &movups (&QWP(0,"eax"),$inout0);
    &ret ();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x, but it's unfeasible to accommodate it
# in XMM registers addressable in 32-bit mode and therefore 6x is
# used instead...
# Emits _aesni_${p}rypt3, which processes $inout0..$inout2 in parallel.
#
# $p - "enc" or "dec", selects aes${p}/aes${p}last
#
# $rounds is halved up front and every loop iteration applies two round
# keys ($rndkey1, then $rndkey0) to all three blocks. The generated
# routine modifies $key and $rounds, so callers that need them afterwards
# must keep their own copies.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey ($rndkey0,&QWP(0,$key));
        &shr ($rounds,1);
        &$movekey ($rndkey1,&QWP(16,$key));
        &lea ($key,&DWP(32,$key));
        &xorps ($inout0,$rndkey0);
        &pxor ($inout1,$rndkey0);
        &pxor ($inout2,$rndkey0);
        &$movekey ($rndkey0,&QWP(0,$key));

    &set_label("${p}3_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        &dec ($rounds);         # sets ZF for the jnz at the loop bottom
        eval"&aes${p} ($inout2,$rndkey1)";
        &$movekey ($rndkey1,&QWP(16,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        &lea ($key,&DWP(32,$key));
        eval"&aes${p} ($inout2,$rndkey0)";
        &$movekey ($rndkey0,&QWP(0,$key));
    &jnz (&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
247 sub aesni_generate4 248 { my $p=shift; 249 250 &function_begin_B("_aesni_${p}rypt4"); 251 &$movekey ($rndkey0,&QWP(0,$key)); 252 &$movekey ($rndkey1,&QWP(16,$key)); 253 &shr ($rounds,1); 254 &lea ($key,&DWP(32,$key)); 255 &xorps ($inout0,$rndkey0); 256 &pxor ($inout1,$rndkey0); 257 &pxor ($inout2,$rndkey0); 258 &pxor ($inout3,$rndkey0); 259 &$movekey ($rndkey0,&QWP(0,$key)); 260 261 &set_label("${p}4_loop"); 262 eval"&aes${p} ($inout0,$rndkey1)"; 263 eval"&aes${p} ($inout1,$rndkey1)"; 264 &dec ($rounds); 265 eval"&aes${p} ($inout2,$rndkey1)"; 266 eval"&aes${p} ($inout3,$rndkey1)"; 267 &$movekey ($rndkey1,&QWP(16,$key)); 268 eval"&aes${p} ($inout0,$rndkey0)"; 269 eval"&aes${p} ($inout1,$rndkey0)"; 270 &lea ($key,&DWP(32,$key)); 271 eval"&aes${p} ($inout2,$rndkey0)"; 272 eval"&aes${p} ($inout3,$rndkey0)"; 273 &$movekey ($rndkey0,&QWP(0,$key)); 274 &jnz (&label("${p}4_loop")); 275 276 eval"&aes${p} ($inout0,$rndkey1)"; 277 eval"&aes${p} ($inout1,$rndkey1)"; 278 eval"&aes${p} ($inout2,$rndkey1)"; 279 eval"&aes${p} ($inout3,$rndkey1)"; 280 eval"&aes${p}last ($inout0,$rndkey0)"; 281 eval"&aes${p}last ($inout1,$rndkey0)"; 282 eval"&aes${p}last ($inout2,$rndkey0)"; 283 eval"&aes${p}last ($inout3,$rndkey0)"; 284 &ret(); 285 &function_end_B("_aesni_${p}rypt4"); 286 } 287 288 sub aesni_generate6 289 { my $p=shift; 290 291 &function_begin_B("_aesni_${p}rypt6"); 292 &static_label("_aesni_${p}rypt6_enter"); 293 &$movekey ($rndkey0,&QWP(0,$key)); 294 &shr ($rounds,1); 295 &$movekey ($rndkey1,&QWP(16,$key)); 296 &lea ($key,&DWP(32,$key)); 297 &xorps ($inout0,$rndkey0); 298 &pxor ($inout1,$rndkey0); # pxor does better here 299 eval"&aes${p} ($inout0,$rndkey1)"; 300 &pxor ($inout2,$rndkey0); 301 eval"&aes${p} ($inout1,$rndkey1)"; 302 &pxor ($inout3,$rndkey0); 303 &dec ($rounds); 304 eval"&aes${p} ($inout2,$rndkey1)"; 305 &pxor ($inout4,$rndkey0); 306 eval"&aes${p} ($inout3,$rndkey1)"; 307 &pxor ($inout5,$rndkey0); 308 eval"&aes${p} ($inout4,$rndkey1)"; 309 &$movekey 
($rndkey0,&QWP(0,$key)); 310 eval"&aes${p} ($inout5,$rndkey1)"; 311 &jmp (&label("_aesni_${p}rypt6_enter")); 312 313 &set_label("${p}6_loop",16); 314 eval"&aes${p} ($inout0,$rndkey1)"; 315 eval"&aes${p} ($inout1,$rndkey1)"; 316 &dec ($rounds); 317 eval"&aes${p} ($inout2,$rndkey1)"; 318 eval"&aes${p} ($inout3,$rndkey1)"; 319 eval"&aes${p} ($inout4,$rndkey1)"; 320 eval"&aes${p} ($inout5,$rndkey1)"; 321 &set_label("_aesni_${p}rypt6_enter",16); 322 &$movekey ($rndkey1,&QWP(16,$key)); 323 eval"&aes${p} ($inout0,$rndkey0)"; 324 eval"&aes${p} ($inout1,$rndkey0)"; 325 &lea ($key,&DWP(32,$key)); 326 eval"&aes${p} ($inout2,$rndkey0)"; 327 eval"&aes${p} ($inout3,$rndkey0)"; 328 eval"&aes${p} ($inout4,$rndkey0)"; 329 eval"&aes${p} ($inout5,$rndkey0)"; 330 &$movekey ($rndkey0,&QWP(0,$key)); 331 &jnz (&label("${p}6_loop")); 332 333 eval"&aes${p} ($inout0,$rndkey1)"; 334 eval"&aes${p} ($inout1,$rndkey1)"; 335 eval"&aes${p} ($inout2,$rndkey1)"; 336 eval"&aes${p} ($inout3,$rndkey1)"; 337 eval"&aes${p} ($inout4,$rndkey1)"; 338 eval"&aes${p} ($inout5,$rndkey1)"; 339 eval"&aes${p}last ($inout0,$rndkey0)"; 340 eval"&aes${p}last ($inout1,$rndkey0)"; 341 eval"&aes${p}last ($inout2,$rndkey0)"; 342 eval"&aes${p}last ($inout3,$rndkey0)"; 343 eval"&aes${p}last ($inout4,$rndkey0)"; 344 eval"&aes${p}last ($inout5,$rndkey0)"; 345 &ret(); 346 &function_end_B("_aesni_${p}rypt6"); 347 } 348 &aesni_generate3("enc") if ($PREFIX eq "aesni"); 349 &aesni_generate3("dec"); 350 &aesni_generate4("enc") if ($PREFIX eq "aesni"); 351 &aesni_generate4("dec"); 352 &aesni_generate6("enc") if ($PREFIX eq "aesni"); 353 &aesni_generate6("dec"); 354 356 if ($PREFIX eq "aesni") { 357 ###################################################################### 358 # void aesni_ecb_encrypt (const void *in, void *out, 359 # size_t length, const AES_KEY *key, 360 # int enc); 361 &function_begin("aesni_ecb_encrypt"); 362 &mov ($inp,&wparam(0)); 363 &mov ($out,&wparam(1)); 364 &mov ($len,&wparam(2)); 365 &mov ($key,&wparam(3)); 
366 &mov ($rounds_,&wparam(4)); 367 &and ($len,-16); 368 &jz (&label("ecb_ret")); 369 &mov ($rounds,&DWP(240,$key)); 370 &test ($rounds_,$rounds_); 371 &jz (&label("ecb_decrypt")); 372 373 &mov ($key_,$key); # backup $key 374 &mov ($rounds_,$rounds); # backup $rounds 375 &cmp ($len,0x60); 376 &jb (&label("ecb_enc_tail")); 377 378 &movdqu ($inout0,&QWP(0,$inp)); 379 &movdqu ($inout1,&QWP(0x10,$inp)); 380 &movdqu ($inout2,&QWP(0x20,$inp)); 381 &movdqu ($inout3,&QWP(0x30,$inp)); 382 &movdqu ($inout4,&QWP(0x40,$inp)); 383 &movdqu ($inout5,&QWP(0x50,$inp)); 384 &lea ($inp,&DWP(0x60,$inp)); 385 &sub ($len,0x60); 386 &jmp (&label("ecb_enc_loop6_enter")); 387 388 &set_label("ecb_enc_loop6",16); 389 &movups (&QWP(0,$out),$inout0); 390 &movdqu ($inout0,&QWP(0,$inp)); 391 &movups (&QWP(0x10,$out),$inout1); 392 &movdqu ($inout1,&QWP(0x10,$inp)); 393 &movups (&QWP(0x20,$out),$inout2); 394 &movdqu ($inout2,&QWP(0x20,$inp)); 395 &movups (&QWP(0x30,$out),$inout3); 396 &movdqu ($inout3,&QWP(0x30,$inp)); 397 &movups (&QWP(0x40,$out),$inout4); 398 &movdqu ($inout4,&QWP(0x40,$inp)); 399 &movups (&QWP(0x50,$out),$inout5); 400 &lea ($out,&DWP(0x60,$out)); 401 &movdqu ($inout5,&QWP(0x50,$inp)); 402 &lea ($inp,&DWP(0x60,$inp)); 403 &set_label("ecb_enc_loop6_enter"); 404 405 &call ("_aesni_encrypt6"); 406 407 &mov ($key,$key_); # restore $key 408 &mov ($rounds,$rounds_); # restore $rounds 409 &sub ($len,0x60); 410 &jnc (&label("ecb_enc_loop6")); 411 412 &movups (&QWP(0,$out),$inout0); 413 &movups (&QWP(0x10,$out),$inout1); 414 &movups (&QWP(0x20,$out),$inout2); 415 &movups (&QWP(0x30,$out),$inout3); 416 &movups (&QWP(0x40,$out),$inout4); 417 &movups (&QWP(0x50,$out),$inout5); 418 &lea ($out,&DWP(0x60,$out)); 419 &add ($len,0x60); 420 &jz (&label("ecb_ret")); 421 422 &set_label("ecb_enc_tail"); 423 &movups ($inout0,&QWP(0,$inp)); 424 &cmp ($len,0x20); 425 &jb (&label("ecb_enc_one")); 426 &movups ($inout1,&QWP(0x10,$inp)); 427 &je (&label("ecb_enc_two")); 428 &movups 
($inout2,&QWP(0x20,$inp)); 429 &cmp ($len,0x40); 430 &jb (&label("ecb_enc_three")); 431 &movups ($inout3,&QWP(0x30,$inp)); 432 &je (&label("ecb_enc_four")); 433 &movups ($inout4,&QWP(0x40,$inp)); 434 &xorps ($inout5,$inout5); 435 &call ("_aesni_encrypt6"); 436 &movups (&QWP(0,$out),$inout0); 437 &movups (&QWP(0x10,$out),$inout1); 438 &movups (&QWP(0x20,$out),$inout2); 439 &movups (&QWP(0x30,$out),$inout3); 440 &movups (&QWP(0x40,$out),$inout4); 441 jmp (&label("ecb_ret")); 442 443 &set_label("ecb_enc_one",16); 444 if ($inline) 445 { &aesni_inline_generate1("enc"); } 446 else 447 { &call ("_aesni_encrypt1"); } 448 &movups (&QWP(0,$out),$inout0); 449 &jmp (&label("ecb_ret")); 450 451 &set_label("ecb_enc_two",16); 452 &xorps ($inout2,$inout2); 453 &call ("_aesni_encrypt3"); 454 &movups (&QWP(0,$out),$inout0); 455 &movups (&QWP(0x10,$out),$inout1); 456 &jmp (&label("ecb_ret")); 457 458 &set_label("ecb_enc_three",16); 459 &call ("_aesni_encrypt3"); 460 &movups (&QWP(0,$out),$inout0); 461 &movups (&QWP(0x10,$out),$inout1); 462 &movups (&QWP(0x20,$out),$inout2); 463 &jmp (&label("ecb_ret")); 464 465 &set_label("ecb_enc_four",16); 466 &call ("_aesni_encrypt4"); 467 &movups (&QWP(0,$out),$inout0); 468 &movups (&QWP(0x10,$out),$inout1); 469 &movups (&QWP(0x20,$out),$inout2); 470 &movups (&QWP(0x30,$out),$inout3); 471 &jmp (&label("ecb_ret")); 472 ###################################################################### 473 &set_label("ecb_decrypt",16); 474 &mov ($key_,$key); # backup $key 475 &mov ($rounds_,$rounds); # backup $rounds 476 &cmp ($len,0x60); 477 &jb (&label("ecb_dec_tail")); 478 479 &movdqu ($inout0,&QWP(0,$inp)); 480 &movdqu ($inout1,&QWP(0x10,$inp)); 481 &movdqu ($inout2,&QWP(0x20,$inp)); 482 &movdqu ($inout3,&QWP(0x30,$inp)); 483 &movdqu ($inout4,&QWP(0x40,$inp)); 484 &movdqu ($inout5,&QWP(0x50,$inp)); 485 &lea ($inp,&DWP(0x60,$inp)); 486 &sub ($len,0x60); 487 &jmp (&label("ecb_dec_loop6_enter")); 488 489 &set_label("ecb_dec_loop6",16); 490 &movups 
(&QWP(0,$out),$inout0); 491 &movdqu ($inout0,&QWP(0,$inp)); 492 &movups (&QWP(0x10,$out),$inout1); 493 &movdqu ($inout1,&QWP(0x10,$inp)); 494 &movups (&QWP(0x20,$out),$inout2); 495 &movdqu ($inout2,&QWP(0x20,$inp)); 496 &movups (&QWP(0x30,$out),$inout3); 497 &movdqu ($inout3,&QWP(0x30,$inp)); 498 &movups (&QWP(0x40,$out),$inout4); 499 &movdqu ($inout4,&QWP(0x40,$inp)); 500 &movups (&QWP(0x50,$out),$inout5); 501 &lea ($out,&DWP(0x60,$out)); 502 &movdqu ($inout5,&QWP(0x50,$inp)); 503 &lea ($inp,&DWP(0x60,$inp)); 504 &set_label("ecb_dec_loop6_enter"); 505 506 &call ("_aesni_decrypt6"); 507 508 &mov ($key,$key_); # restore $key 509 &mov ($rounds,$rounds_); # restore $rounds 510 &sub ($len,0x60); 511 &jnc (&label("ecb_dec_loop6")); 512 513 &movups (&QWP(0,$out),$inout0); 514 &movups (&QWP(0x10,$out),$inout1); 515 &movups (&QWP(0x20,$out),$inout2); 516 &movups (&QWP(0x30,$out),$inout3); 517 &movups (&QWP(0x40,$out),$inout4); 518 &movups (&QWP(0x50,$out),$inout5); 519 &lea ($out,&DWP(0x60,$out)); 520 &add ($len,0x60); 521 &jz (&label("ecb_ret")); 522 523 &set_label("ecb_dec_tail"); 524 &movups ($inout0,&QWP(0,$inp)); 525 &cmp ($len,0x20); 526 &jb (&label("ecb_dec_one")); 527 &movups ($inout1,&QWP(0x10,$inp)); 528 &je (&label("ecb_dec_two")); 529 &movups ($inout2,&QWP(0x20,$inp)); 530 &cmp ($len,0x40); 531 &jb (&label("ecb_dec_three")); 532 &movups ($inout3,&QWP(0x30,$inp)); 533 &je (&label("ecb_dec_four")); 534 &movups ($inout4,&QWP(0x40,$inp)); 535 &xorps ($inout5,$inout5); 536 &call ("_aesni_decrypt6"); 537 &movups (&QWP(0,$out),$inout0); 538 &movups (&QWP(0x10,$out),$inout1); 539 &movups (&QWP(0x20,$out),$inout2); 540 &movups (&QWP(0x30,$out),$inout3); 541 &movups (&QWP(0x40,$out),$inout4); 542 &jmp (&label("ecb_ret")); 543 544 &set_label("ecb_dec_one",16); 545 if ($inline) 546 { &aesni_inline_generate1("dec"); } 547 else 548 { &call ("_aesni_decrypt1"); } 549 &movups (&QWP(0,$out),$inout0); 550 &jmp (&label("ecb_ret")); 551 552 &set_label("ecb_dec_two",16); 553 &xorps 
($inout2,$inout2); 554 &call ("_aesni_decrypt3"); 555 &movups (&QWP(0,$out),$inout0); 556 &movups (&QWP(0x10,$out),$inout1); 557 &jmp (&label("ecb_ret")); 558 559 &set_label("ecb_dec_three",16); 560 &call ("_aesni_decrypt3"); 561 &movups (&QWP(0,$out),$inout0); 562 &movups (&QWP(0x10,$out),$inout1); 563 &movups (&QWP(0x20,$out),$inout2); 564 &jmp (&label("ecb_ret")); 565 566 &set_label("ecb_dec_four",16); 567 &call ("_aesni_decrypt4"); 568 &movups (&QWP(0,$out),$inout0); 569 &movups (&QWP(0x10,$out),$inout1); 570 &movups (&QWP(0x20,$out),$inout2); 571 &movups (&QWP(0x30,$out),$inout3); 572 573 &set_label("ecb_ret"); 574 &function_end("aesni_ecb_encrypt"); 575 577 ###################################################################### 578 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 579 # size_t blocks, const AES_KEY *key, 580 # const char *ivec,char *cmac); 581 # 582 # Handles only complete blocks, operates on 64-bit counter and 583 # does not update *ivec! Nor does it finalize CMAC value 584 # (see engine/eng_aesni.c for details) 585 # 586 { my $cmac=$inout1; 587 &function_begin("aesni_ccm64_encrypt_blocks"); 588 &mov ($inp,&wparam(0)); 589 &mov ($out,&wparam(1)); 590 &mov ($len,&wparam(2)); 591 &mov ($key,&wparam(3)); 592 &mov ($rounds_,&wparam(4)); 593 &mov ($rounds,&wparam(5)); 594 &mov ($key_,"esp"); 595 &sub ("esp",60); 596 &and ("esp",-16); # align stack 597 &mov (&DWP(48,"esp"),$key_); 598 599 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 600 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 601 &mov ($rounds,&DWP(240,$key)); 602 603 # compose byte-swap control mask for pshufb on stack 604 &mov (&DWP(0,"esp"),0x0c0d0e0f); 605 &mov (&DWP(4,"esp"),0x08090a0b); 606 &mov (&DWP(8,"esp"),0x04050607); 607 &mov (&DWP(12,"esp"),0x00010203); 608 609 # compose counter increment vector on stack 610 &mov ($rounds_,1); 611 &xor ($key_,$key_); 612 &mov (&DWP(16,"esp"),$rounds_); 613 &mov (&DWP(20,"esp"),$key_); 614 &mov (&DWP(24,"esp"),$key_); 615 &mov 
(&DWP(28,"esp"),$key_); 616 617 &shr ($rounds,1); 618 &lea ($key_,&DWP(0,$key)); 619 &movdqa ($inout3,&QWP(0,"esp")); 620 &movdqa ($inout0,$ivec); 621 &mov ($rounds_,$rounds); 622 &pshufb ($ivec,$inout3); 623 624 &set_label("ccm64_enc_outer"); 625 &$movekey ($rndkey0,&QWP(0,$key_)); 626 &mov ($rounds,$rounds_); 627 &movups ($in0,&QWP(0,$inp)); 628 629 &xorps ($inout0,$rndkey0); 630 &$movekey ($rndkey1,&QWP(16,$key_)); 631 &xorps ($rndkey0,$in0); 632 &lea ($key,&DWP(32,$key_)); 633 &xorps ($cmac,$rndkey0); # cmac^=inp 634 &$movekey ($rndkey0,&QWP(0,$key)); 635 636 &set_label("ccm64_enc2_loop"); 637 &aesenc ($inout0,$rndkey1); 638 &dec ($rounds); 639 &aesenc ($cmac,$rndkey1); 640 &$movekey ($rndkey1,&QWP(16,$key)); 641 &aesenc ($inout0,$rndkey0); 642 &lea ($key,&DWP(32,$key)); 643 &aesenc ($cmac,$rndkey0); 644 &$movekey ($rndkey0,&QWP(0,$key)); 645 &jnz (&label("ccm64_enc2_loop")); 646 &aesenc ($inout0,$rndkey1); 647 &aesenc ($cmac,$rndkey1); 648 &paddq ($ivec,&QWP(16,"esp")); 649 &aesenclast ($inout0,$rndkey0); 650 &aesenclast ($cmac,$rndkey0); 651 652 &dec ($len); 653 &lea ($inp,&DWP(16,$inp)); 654 &xorps ($in0,$inout0); # inp^=E(ivec) 655 &movdqa ($inout0,$ivec); 656 &movups (&QWP(0,$out),$in0); # save output 657 &lea ($out,&DWP(16,$out)); 658 &pshufb ($inout0,$inout3); 659 &jnz (&label("ccm64_enc_outer")); 660 661 &mov ("esp",&DWP(48,"esp")); 662 &mov ($out,&wparam(5)); 663 &movups (&QWP(0,$out),$cmac); 664 &function_end("aesni_ccm64_encrypt_blocks"); 665 666 &function_begin("aesni_ccm64_decrypt_blocks"); 667 &mov ($inp,&wparam(0)); 668 &mov ($out,&wparam(1)); 669 &mov ($len,&wparam(2)); 670 &mov ($key,&wparam(3)); 671 &mov ($rounds_,&wparam(4)); 672 &mov ($rounds,&wparam(5)); 673 &mov ($key_,"esp"); 674 &sub ("esp",60); 675 &and ("esp",-16); # align stack 676 &mov (&DWP(48,"esp"),$key_); 677 678 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 679 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 680 &mov ($rounds,&DWP(240,$key)); 681 682 # compose byte-swap control 
mask for pshufb on stack 683 &mov (&DWP(0,"esp"),0x0c0d0e0f); 684 &mov (&DWP(4,"esp"),0x08090a0b); 685 &mov (&DWP(8,"esp"),0x04050607); 686 &mov (&DWP(12,"esp"),0x00010203); 687 688 # compose counter increment vector on stack 689 &mov ($rounds_,1); 690 &xor ($key_,$key_); 691 &mov (&DWP(16,"esp"),$rounds_); 692 &mov (&DWP(20,"esp"),$key_); 693 &mov (&DWP(24,"esp"),$key_); 694 &mov (&DWP(28,"esp"),$key_); 695 696 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 697 &movdqa ($inout0,$ivec); 698 699 &mov ($key_,$key); 700 &mov ($rounds_,$rounds); 701 702 &pshufb ($ivec,$inout3); 703 if ($inline) 704 { &aesni_inline_generate1("enc"); } 705 else 706 { &call ("_aesni_encrypt1"); } 707 &movups ($in0,&QWP(0,$inp)); # load inp 708 &paddq ($ivec,&QWP(16,"esp")); 709 &lea ($inp,&QWP(16,$inp)); 710 &jmp (&label("ccm64_dec_outer")); 711 712 &set_label("ccm64_dec_outer",16); 713 &xorps ($in0,$inout0); # inp ^= E(ivec) 714 &movdqa ($inout0,$ivec); 715 &mov ($rounds,$rounds_); 716 &movups (&QWP(0,$out),$in0); # save output 717 &lea ($out,&DWP(16,$out)); 718 &pshufb ($inout0,$inout3); 719 720 &sub ($len,1); 721 &jz (&label("ccm64_dec_break")); 722 723 &$movekey ($rndkey0,&QWP(0,$key_)); 724 &shr ($rounds,1); 725 &$movekey ($rndkey1,&QWP(16,$key_)); 726 &xorps ($in0,$rndkey0); 727 &lea ($key,&DWP(32,$key_)); 728 &xorps ($inout0,$rndkey0); 729 &xorps ($cmac,$in0); # cmac^=out 730 &$movekey ($rndkey0,&QWP(0,$key)); 731 732 &set_label("ccm64_dec2_loop"); 733 &aesenc ($inout0,$rndkey1); 734 &dec ($rounds); 735 &aesenc ($cmac,$rndkey1); 736 &$movekey ($rndkey1,&QWP(16,$key)); 737 &aesenc ($inout0,$rndkey0); 738 &lea ($key,&DWP(32,$key)); 739 &aesenc ($cmac,$rndkey0); 740 &$movekey ($rndkey0,&QWP(0,$key)); 741 &jnz (&label("ccm64_dec2_loop")); 742 &movups ($in0,&QWP(0,$inp)); # load inp 743 &paddq ($ivec,&QWP(16,"esp")); 744 &aesenc ($inout0,$rndkey1); 745 &aesenc ($cmac,$rndkey1); 746 &lea ($inp,&QWP(16,$inp)); 747 &aesenclast ($inout0,$rndkey0); 748 &aesenclast ($cmac,$rndkey0); 749 &jmp 
(&label("ccm64_dec_outer")); 750 751 &set_label("ccm64_dec_break",16); 752 &mov ($key,$key_); 753 if ($inline) 754 { &aesni_inline_generate1("enc",$cmac,$in0); } 755 else 756 { &call ("_aesni_encrypt1",$cmac); } 757 758 &mov ("esp",&DWP(48,"esp")); 759 &mov ($out,&wparam(5)); 760 &movups (&QWP(0,$out),$cmac); 761 &function_end("aesni_ccm64_decrypt_blocks"); 762 } 763 765 ###################################################################### 766 # void aesni_ctr32_encrypt_blocks (const void *in, void *out, 767 # size_t blocks, const AES_KEY *key, 768 # const char *ivec); 769 # 770 # Handles only complete blocks, operates on 32-bit counter and 771 # does not update *ivec! (see engine/eng_aesni.c for details) 772 # 773 # stack layout: 774 # 0 pshufb mask 775 # 16 vector addend: 0,6,6,6 776 # 32 counter-less ivec 777 # 48 1st triplet of counter vector 778 # 64 2nd triplet of counter vector 779 # 80 saved %esp 780 781 &function_begin("aesni_ctr32_encrypt_blocks"); 782 &mov ($inp,&wparam(0)); 783 &mov ($out,&wparam(1)); 784 &mov ($len,&wparam(2)); 785 &mov ($key,&wparam(3)); 786 &mov ($rounds_,&wparam(4)); 787 &mov ($key_,"esp"); 788 &sub ("esp",88); 789 &and ("esp",-16); # align stack 790 &mov (&DWP(80,"esp"),$key_); 791 792 &cmp ($len,1); 793 &je (&label("ctr32_one_shortcut")); 794 795 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 796 797 # compose byte-swap control mask for pshufb on stack 798 &mov (&DWP(0,"esp"),0x0c0d0e0f); 799 &mov (&DWP(4,"esp"),0x08090a0b); 800 &mov (&DWP(8,"esp"),0x04050607); 801 &mov (&DWP(12,"esp"),0x00010203); 802 803 # compose counter increment vector on stack 804 &mov ($rounds,6); 805 &xor ($key_,$key_); 806 &mov (&DWP(16,"esp"),$rounds); 807 &mov (&DWP(20,"esp"),$rounds); 808 &mov (&DWP(24,"esp"),$rounds); 809 &mov (&DWP(28,"esp"),$key_); 810 811 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 812 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 813 814 &mov ($rounds,&DWP(240,$key)); # key->rounds 815 816 # compose 2 vectors of 
3x32-bit counters 817 &bswap ($rounds_); 818 &pxor ($rndkey1,$rndkey1); 819 &pxor ($rndkey0,$rndkey0); 820 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 821 &pinsrd ($rndkey1,$rounds_,0); 822 &lea ($key_,&DWP(3,$rounds_)); 823 &pinsrd ($rndkey0,$key_,0); 824 &inc ($rounds_); 825 &pinsrd ($rndkey1,$rounds_,1); 826 &inc ($key_); 827 &pinsrd ($rndkey0,$key_,1); 828 &inc ($rounds_); 829 &pinsrd ($rndkey1,$rounds_,2); 830 &inc ($key_); 831 &pinsrd ($rndkey0,$key_,2); 832 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 833 &pshufb ($rndkey1,$inout0); # byte swap 834 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 835 &pshufb ($rndkey0,$inout0); # byte swap 836 837 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword 838 &pshufd ($inout1,$rndkey1,2<<6); 839 &cmp ($len,6); 840 &jb (&label("ctr32_tail")); 841 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec 842 &shr ($rounds,1); 843 &mov ($key_,$key); # backup $key 844 &mov ($rounds_,$rounds); # backup $rounds 845 &sub ($len,6); 846 &jmp (&label("ctr32_loop6")); 847 848 &set_label("ctr32_loop6",16); 849 &pshufd ($inout2,$rndkey1,1<<6); 850 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec 851 &pshufd ($inout3,$rndkey0,3<<6); 852 &por ($inout0,$rndkey1); # merge counter-less ivec 853 &pshufd ($inout4,$rndkey0,2<<6); 854 &por ($inout1,$rndkey1); 855 &pshufd ($inout5,$rndkey0,1<<6); 856 &por ($inout2,$rndkey1); 857 &por ($inout3,$rndkey1); 858 &por ($inout4,$rndkey1); 859 &por ($inout5,$rndkey1); 860 861 # inlining _aesni_encrypt6's prologue gives ~4% improvement... 
862 &$movekey ($rndkey0,&QWP(0,$key_)); 863 &$movekey ($rndkey1,&QWP(16,$key_)); 864 &lea ($key,&DWP(32,$key_)); 865 &dec ($rounds); 866 &pxor ($inout0,$rndkey0); 867 &pxor ($inout1,$rndkey0); 868 &aesenc ($inout0,$rndkey1); 869 &pxor ($inout2,$rndkey0); 870 &aesenc ($inout1,$rndkey1); 871 &pxor ($inout3,$rndkey0); 872 &aesenc ($inout2,$rndkey1); 873 &pxor ($inout4,$rndkey0); 874 &aesenc ($inout3,$rndkey1); 875 &pxor ($inout5,$rndkey0); 876 &aesenc ($inout4,$rndkey1); 877 &$movekey ($rndkey0,&QWP(0,$key)); 878 &aesenc ($inout5,$rndkey1); 879 880 &call (&label("_aesni_encrypt6_enter")); 881 882 &movups ($rndkey1,&QWP(0,$inp)); 883 &movups ($rndkey0,&QWP(0x10,$inp)); 884 &xorps ($inout0,$rndkey1); 885 &movups ($rndkey1,&QWP(0x20,$inp)); 886 &xorps ($inout1,$rndkey0); 887 &movups (&QWP(0,$out),$inout0); 888 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 889 &xorps ($inout2,$rndkey1); 890 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet 891 &movups (&QWP(0x10,$out),$inout1); 892 &movups (&QWP(0x20,$out),$inout2); 893 894 &paddd ($rndkey1,$rndkey0); # 1st triplet increment 895 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment 896 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 897 898 &movups ($inout1,&QWP(0x30,$inp)); 899 &movups ($inout2,&QWP(0x40,$inp)); 900 &xorps ($inout3,$inout1); 901 &movups ($inout1,&QWP(0x50,$inp)); 902 &lea ($inp,&DWP(0x60,$inp)); 903 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 904 &pshufb ($rndkey1,$inout0); # byte swap 905 &xorps ($inout4,$inout2); 906 &movups (&QWP(0x30,$out),$inout3); 907 &xorps ($inout5,$inout1); 908 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 909 &pshufb ($rndkey0,$inout0); # byte swap 910 &movups (&QWP(0x40,$out),$inout4); 911 &pshufd ($inout0,$rndkey1,3<<6); 912 &movups (&QWP(0x50,$out),$inout5); 913 &lea ($out,&DWP(0x60,$out)); 914 915 &mov ($rounds,$rounds_); 916 &pshufd ($inout1,$rndkey1,2<<6); 917 &sub ($len,6); 918 &jnc (&label("ctr32_loop6")); 919 920 &add ($len,6); 
921 &jz (&label("ctr32_ret")); 922 &mov ($key,$key_); 923 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 924 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec 925 926 &set_label("ctr32_tail"); 927 &por ($inout0,$inout5); 928 &cmp ($len,2); 929 &jb (&label("ctr32_one")); 930 931 &pshufd ($inout2,$rndkey1,1<<6); 932 &por ($inout1,$inout5); 933 &je (&label("ctr32_two")); 934 935 &pshufd ($inout3,$rndkey0,3<<6); 936 &por ($inout2,$inout5); 937 &cmp ($len,4); 938 &jb (&label("ctr32_three")); 939 940 &pshufd ($inout4,$rndkey0,2<<6); 941 &por ($inout3,$inout5); 942 &je (&label("ctr32_four")); 943 944 &por ($inout4,$inout5); 945 &call ("_aesni_encrypt6"); 946 &movups ($rndkey1,&QWP(0,$inp)); 947 &movups ($rndkey0,&QWP(0x10,$inp)); 948 &xorps ($inout0,$rndkey1); 949 &movups ($rndkey1,&QWP(0x20,$inp)); 950 &xorps ($inout1,$rndkey0); 951 &movups ($rndkey0,&QWP(0x30,$inp)); 952 &xorps ($inout2,$rndkey1); 953 &movups ($rndkey1,&QWP(0x40,$inp)); 954 &xorps ($inout3,$rndkey0); 955 &movups (&QWP(0,$out),$inout0); 956 &xorps ($inout4,$rndkey1); 957 &movups (&QWP(0x10,$out),$inout1); 958 &movups (&QWP(0x20,$out),$inout2); 959 &movups (&QWP(0x30,$out),$inout3); 960 &movups (&QWP(0x40,$out),$inout4); 961 &jmp (&label("ctr32_ret")); 962 963 &set_label("ctr32_one_shortcut",16); 964 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 965 &mov ($rounds,&DWP(240,$key)); 966 967 &set_label("ctr32_one"); 968 if ($inline) 969 { &aesni_inline_generate1("enc"); } 970 else 971 { &call ("_aesni_encrypt1"); } 972 &movups ($in0,&QWP(0,$inp)); 973 &xorps ($in0,$inout0); 974 &movups (&QWP(0,$out),$in0); 975 &jmp (&label("ctr32_ret")); 976 977 &set_label("ctr32_two",16); 978 &call ("_aesni_encrypt3"); 979 &movups ($inout3,&QWP(0,$inp)); 980 &movups ($inout4,&QWP(0x10,$inp)); 981 &xorps ($inout0,$inout3); 982 &xorps ($inout1,$inout4); 983 &movups (&QWP(0,$out),$inout0); 984 &movups (&QWP(0x10,$out),$inout1); 985 &jmp (&label("ctr32_ret")); 986 987 &set_label("ctr32_three",16); 988 &call 
("_aesni_encrypt3"); 989 &movups ($inout3,&QWP(0,$inp)); 990 &movups ($inout4,&QWP(0x10,$inp)); 991 &xorps ($inout0,$inout3); 992 &movups ($inout5,&QWP(0x20,$inp)); 993 &xorps ($inout1,$inout4); 994 &movups (&QWP(0,$out),$inout0); 995 &xorps ($inout2,$inout5); 996 &movups (&QWP(0x10,$out),$inout1); 997 &movups (&QWP(0x20,$out),$inout2); 998 &jmp (&label("ctr32_ret")); 999 1000 &set_label("ctr32_four",16); 1001 &call ("_aesni_encrypt4"); 1002 &movups ($inout4,&QWP(0,$inp)); 1003 &movups ($inout5,&QWP(0x10,$inp)); 1004 &movups ($rndkey1,&QWP(0x20,$inp)); 1005 &xorps ($inout0,$inout4); 1006 &movups ($rndkey0,&QWP(0x30,$inp)); 1007 &xorps ($inout1,$inout5); 1008 &movups (&QWP(0,$out),$inout0); 1009 &xorps ($inout2,$rndkey1); 1010 &movups (&QWP(0x10,$out),$inout1); 1011 &xorps ($inout3,$rndkey0); 1012 &movups (&QWP(0x20,$out),$inout2); 1013 &movups (&QWP(0x30,$out),$inout3); 1014 1015 &set_label("ctr32_ret"); 1016 &mov ("esp",&DWP(80,"esp")); 1017 &function_end("aesni_ctr32_encrypt_blocks"); 1018 1020 ###################################################################### 1021 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1022 # const AES_KEY *key1, const AES_KEY *key2, 1023 # const unsigned char iv[16]); 1024 # 1025 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1026 1027 &function_begin("aesni_xts_encrypt"); 1028 &mov ($key,&wparam(4)); # key2 1029 &mov ($inp,&wparam(5)); # clear-text tweak 1030 1031 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1032 &movups ($inout0,&QWP(0,$inp)); 1033 if ($inline) 1034 { &aesni_inline_generate1("enc"); } 1035 else 1036 { &call ("_aesni_encrypt1"); } 1037 1038 &mov ($inp,&wparam(0)); 1039 &mov ($out,&wparam(1)); 1040 &mov ($len,&wparam(2)); 1041 &mov ($key,&wparam(3)); # key1 1042 1043 &mov ($key_,"esp"); 1044 &sub ("esp",16*7+8); 1045 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1046 &and ("esp",-16); # align stack 1047 1048 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 
1049 &mov (&DWP(16*6+4,"esp"),0); 1050 &mov (&DWP(16*6+8,"esp"),1); 1051 &mov (&DWP(16*6+12,"esp"),0); 1052 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1053 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1054 1055 &movdqa ($tweak,$inout0); 1056 &pxor ($twtmp,$twtmp); 1057 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1058 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1059 1060 &and ($len,-16); 1061 &mov ($key_,$key); # backup $key 1062 &mov ($rounds_,$rounds); # backup $rounds 1063 &sub ($len,16*6); 1064 &jc (&label("xts_enc_short")); 1065 1066 &shr ($rounds,1); 1067 &mov ($rounds_,$rounds); 1068 &jmp (&label("xts_enc_loop6")); 1069 1070 &set_label("xts_enc_loop6",16); 1071 for ($i=0;$i<4;$i++) { 1072 &pshufd ($twres,$twtmp,0x13); 1073 &pxor ($twtmp,$twtmp); 1074 &movdqa (&QWP(16*$i,"esp"),$tweak); 1075 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1076 &pand ($twres,$twmask); # isolate carry and residue 1077 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1078 &pxor ($tweak,$twres); 1079 } 1080 &pshufd ($inout5,$twtmp,0x13); 1081 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1082 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1083 &$movekey ($rndkey0,&QWP(0,$key_)); 1084 &pand ($inout5,$twmask); # isolate carry and residue 1085 &movups ($inout0,&QWP(0,$inp)); # load input 1086 &pxor ($inout5,$tweak); 1087 1088 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1089 &movdqu ($inout1,&QWP(16*1,$inp)); 1090 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1091 &movdqu ($inout2,&QWP(16*2,$inp)); 1092 &pxor ($inout1,$rndkey0); 1093 &movdqu ($inout3,&QWP(16*3,$inp)); 1094 &pxor ($inout2,$rndkey0); 1095 &movdqu ($inout4,&QWP(16*4,$inp)); 1096 &pxor ($inout3,$rndkey0); 1097 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1098 &pxor ($inout4,$rndkey0); 1099 &lea ($inp,&DWP(16*6,$inp)); 1100 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1101 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1102 &pxor ($inout5,$rndkey1); 1103 1104 &$movekey 
($rndkey1,&QWP(16,$key_)); 1105 &lea ($key,&DWP(32,$key_)); 1106 &pxor ($inout1,&QWP(16*1,"esp")); 1107 &aesenc ($inout0,$rndkey1); 1108 &pxor ($inout2,&QWP(16*2,"esp")); 1109 &aesenc ($inout1,$rndkey1); 1110 &pxor ($inout3,&QWP(16*3,"esp")); 1111 &dec ($rounds); 1112 &aesenc ($inout2,$rndkey1); 1113 &pxor ($inout4,&QWP(16*4,"esp")); 1114 &aesenc ($inout3,$rndkey1); 1115 &pxor ($inout5,$rndkey0); 1116 &aesenc ($inout4,$rndkey1); 1117 &$movekey ($rndkey0,&QWP(0,$key)); 1118 &aesenc ($inout5,$rndkey1); 1119 &call (&label("_aesni_encrypt6_enter")); 1120 1121 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1122 &pxor ($twtmp,$twtmp); 1123 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1124 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1125 &xorps ($inout1,&QWP(16*1,"esp")); 1126 &movups (&QWP(16*0,$out),$inout0); # write output 1127 &xorps ($inout2,&QWP(16*2,"esp")); 1128 &movups (&QWP(16*1,$out),$inout1); 1129 &xorps ($inout3,&QWP(16*3,"esp")); 1130 &movups (&QWP(16*2,$out),$inout2); 1131 &xorps ($inout4,&QWP(16*4,"esp")); 1132 &movups (&QWP(16*3,$out),$inout3); 1133 &xorps ($inout5,$tweak); 1134 &movups (&QWP(16*4,$out),$inout4); 1135 &pshufd ($twres,$twtmp,0x13); 1136 &movups (&QWP(16*5,$out),$inout5); 1137 &lea ($out,&DWP(16*6,$out)); 1138 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1139 1140 &pxor ($twtmp,$twtmp); 1141 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1142 &pand ($twres,$twmask); # isolate carry and residue 1143 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1144 &mov ($rounds,$rounds_); # restore $rounds 1145 &pxor ($tweak,$twres); 1146 1147 &sub ($len,16*6); 1148 &jnc (&label("xts_enc_loop6")); 1149 1150 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1151 &mov ($key,$key_); # restore $key 1152 &mov ($rounds_,$rounds); 1153 1154 &set_label("xts_enc_short"); 1155 &add ($len,16*6); 1156 &jz (&label("xts_enc_done6x")); 1157 1158 &movdqa ($inout3,$tweak); # put aside previous tweak 1159 &cmp ($len,0x20); 1160 &jb 
(&label("xts_enc_one")); 1161 1162 &pshufd ($twres,$twtmp,0x13); 1163 &pxor ($twtmp,$twtmp); 1164 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1165 &pand ($twres,$twmask); # isolate carry and residue 1166 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1167 &pxor ($tweak,$twres); 1168 &je (&label("xts_enc_two")); 1169 1170 &pshufd ($twres,$twtmp,0x13); 1171 &pxor ($twtmp,$twtmp); 1172 &movdqa ($inout4,$tweak); # put aside previous tweak 1173 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1174 &pand ($twres,$twmask); # isolate carry and residue 1175 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1176 &pxor ($tweak,$twres); 1177 &cmp ($len,0x40); 1178 &jb (&label("xts_enc_three")); 1179 1180 &pshufd ($twres,$twtmp,0x13); 1181 &pxor ($twtmp,$twtmp); 1182 &movdqa ($inout5,$tweak); # put aside previous tweak 1183 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1184 &pand ($twres,$twmask); # isolate carry and residue 1185 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1186 &pxor ($tweak,$twres); 1187 &movdqa (&QWP(16*0,"esp"),$inout3); 1188 &movdqa (&QWP(16*1,"esp"),$inout4); 1189 &je (&label("xts_enc_four")); 1190 1191 &movdqa (&QWP(16*2,"esp"),$inout5); 1192 &pshufd ($inout5,$twtmp,0x13); 1193 &movdqa (&QWP(16*3,"esp"),$tweak); 1194 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1195 &pand ($inout5,$twmask); # isolate carry and residue 1196 &pxor ($inout5,$tweak); 1197 1198 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1199 &movdqu ($inout1,&QWP(16*1,$inp)); 1200 &movdqu ($inout2,&QWP(16*2,$inp)); 1201 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1202 &movdqu ($inout3,&QWP(16*3,$inp)); 1203 &pxor ($inout1,&QWP(16*1,"esp")); 1204 &movdqu ($inout4,&QWP(16*4,$inp)); 1205 &pxor ($inout2,&QWP(16*2,"esp")); 1206 &lea ($inp,&DWP(16*5,$inp)); 1207 &pxor ($inout3,&QWP(16*3,"esp")); 1208 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1209 &pxor ($inout4,$inout5); 1210 1211 &call ("_aesni_encrypt6"); 1212 1213 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1214 &xorps 
($inout0,&QWP(16*0,"esp")); # output^=tweak 1215 &xorps ($inout1,&QWP(16*1,"esp")); 1216 &xorps ($inout2,&QWP(16*2,"esp")); 1217 &movups (&QWP(16*0,$out),$inout0); # write output 1218 &xorps ($inout3,&QWP(16*3,"esp")); 1219 &movups (&QWP(16*1,$out),$inout1); 1220 &xorps ($inout4,$tweak); 1221 &movups (&QWP(16*2,$out),$inout2); 1222 &movups (&QWP(16*3,$out),$inout3); 1223 &movups (&QWP(16*4,$out),$inout4); 1224 &lea ($out,&DWP(16*5,$out)); 1225 &jmp (&label("xts_enc_done")); 1226 1227 &set_label("xts_enc_one",16); 1228 &movups ($inout0,&QWP(16*0,$inp)); # load input 1229 &lea ($inp,&DWP(16*1,$inp)); 1230 &xorps ($inout0,$inout3); # input^=tweak 1231 if ($inline) 1232 { &aesni_inline_generate1("enc"); } 1233 else 1234 { &call ("_aesni_encrypt1"); } 1235 &xorps ($inout0,$inout3); # output^=tweak 1236 &movups (&QWP(16*0,$out),$inout0); # write output 1237 &lea ($out,&DWP(16*1,$out)); 1238 1239 &movdqa ($tweak,$inout3); # last tweak 1240 &jmp (&label("xts_enc_done")); 1241 1242 &set_label("xts_enc_two",16); 1243 &movaps ($inout4,$tweak); # put aside last tweak 1244 1245 &movups ($inout0,&QWP(16*0,$inp)); # load input 1246 &movups ($inout1,&QWP(16*1,$inp)); 1247 &lea ($inp,&DWP(16*2,$inp)); 1248 &xorps ($inout0,$inout3); # input^=tweak 1249 &xorps ($inout1,$inout4); 1250 &xorps ($inout2,$inout2); 1251 1252 &call ("_aesni_encrypt3"); 1253 1254 &xorps ($inout0,$inout3); # output^=tweak 1255 &xorps ($inout1,$inout4); 1256 &movups (&QWP(16*0,$out),$inout0); # write output 1257 &movups (&QWP(16*1,$out),$inout1); 1258 &lea ($out,&DWP(16*2,$out)); 1259 1260 &movdqa ($tweak,$inout4); # last tweak 1261 &jmp (&label("xts_enc_done")); 1262 1263 &set_label("xts_enc_three",16); 1264 &movaps ($inout5,$tweak); # put aside last tweak 1265 &movups ($inout0,&QWP(16*0,$inp)); # load input 1266 &movups ($inout1,&QWP(16*1,$inp)); 1267 &movups ($inout2,&QWP(16*2,$inp)); 1268 &lea ($inp,&DWP(16*3,$inp)); 1269 &xorps ($inout0,$inout3); # input^=tweak 1270 &xorps ($inout1,$inout4); 1271 &xorps 
($inout2,$inout5); 1272 1273 &call ("_aesni_encrypt3"); 1274 1275 &xorps ($inout0,$inout3); # output^=tweak 1276 &xorps ($inout1,$inout4); 1277 &xorps ($inout2,$inout5); 1278 &movups (&QWP(16*0,$out),$inout0); # write output 1279 &movups (&QWP(16*1,$out),$inout1); 1280 &movups (&QWP(16*2,$out),$inout2); 1281 &lea ($out,&DWP(16*3,$out)); 1282 1283 &movdqa ($tweak,$inout5); # last tweak 1284 &jmp (&label("xts_enc_done")); 1285 1286 &set_label("xts_enc_four",16); 1287 &movaps ($inout4,$tweak); # put aside last tweak 1288 1289 &movups ($inout0,&QWP(16*0,$inp)); # load input 1290 &movups ($inout1,&QWP(16*1,$inp)); 1291 &movups ($inout2,&QWP(16*2,$inp)); 1292 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1293 &movups ($inout3,&QWP(16*3,$inp)); 1294 &lea ($inp,&DWP(16*4,$inp)); 1295 &xorps ($inout1,&QWP(16*1,"esp")); 1296 &xorps ($inout2,$inout5); 1297 &xorps ($inout3,$inout4); 1298 1299 &call ("_aesni_encrypt4"); 1300 1301 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1302 &xorps ($inout1,&QWP(16*1,"esp")); 1303 &xorps ($inout2,$inout5); 1304 &movups (&QWP(16*0,$out),$inout0); # write output 1305 &xorps ($inout3,$inout4); 1306 &movups (&QWP(16*1,$out),$inout1); 1307 &movups (&QWP(16*2,$out),$inout2); 1308 &movups (&QWP(16*3,$out),$inout3); 1309 &lea ($out,&DWP(16*4,$out)); 1310 1311 &movdqa ($tweak,$inout4); # last tweak 1312 &jmp (&label("xts_enc_done")); 1313 1314 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1315 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1316 &and ($len,15); 1317 &jz (&label("xts_enc_ret")); 1318 &movdqa ($inout3,$tweak); 1319 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1320 &jmp (&label("xts_enc_steal")); 1321 1322 &set_label("xts_enc_done",16); 1323 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1324 &pxor ($twtmp,$twtmp); 1325 &and ($len,15); 1326 &jz (&label("xts_enc_ret")); 1327 1328 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1329 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1330 &pshufd 
($inout3,$twtmp,0x13); 1331 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1332 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1333 &pxor ($inout3,$tweak); 1334 1335 &set_label("xts_enc_steal"); 1336 &movz ($rounds,&BP(0,$inp)); 1337 &movz ($key,&BP(-16,$out)); 1338 &lea ($inp,&DWP(1,$inp)); 1339 &mov (&BP(-16,$out),&LB($rounds)); 1340 &mov (&BP(0,$out),&LB($key)); 1341 &lea ($out,&DWP(1,$out)); 1342 &sub ($len,1); 1343 &jnz (&label("xts_enc_steal")); 1344 1345 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1346 &mov ($key,$key_); # restore $key 1347 &mov ($rounds,$rounds_); # restore $rounds 1348 1349 &movups ($inout0,&QWP(-16,$out)); # load input 1350 &xorps ($inout0,$inout3); # input^=tweak 1351 if ($inline) 1352 { &aesni_inline_generate1("enc"); } 1353 else 1354 { &call ("_aesni_encrypt1"); } 1355 &xorps ($inout0,$inout3); # output^=tweak 1356 &movups (&QWP(-16,$out),$inout0); # write output 1357 1358 &set_label("xts_enc_ret"); 1359 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1360 &function_end("aesni_xts_encrypt"); 1361 1362 &function_begin("aesni_xts_decrypt"); 1363 &mov ($key,&wparam(4)); # key2 1364 &mov ($inp,&wparam(5)); # clear-text tweak 1365 1366 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1367 &movups ($inout0,&QWP(0,$inp)); 1368 if ($inline) 1369 { &aesni_inline_generate1("enc"); } 1370 else 1371 { &call ("_aesni_encrypt1"); } 1372 1373 &mov ($inp,&wparam(0)); 1374 &mov ($out,&wparam(1)); 1375 &mov ($len,&wparam(2)); 1376 &mov ($key,&wparam(3)); # key1 1377 1378 &mov ($key_,"esp"); 1379 &sub ("esp",16*7+8); 1380 &and ("esp",-16); # align stack 1381 1382 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1383 &test ($len,15); 1384 &setnz (&LB($rounds_)); 1385 &shl ($rounds_,4); 1386 &sub ($len,$rounds_); 1387 1388 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1389 &mov (&DWP(16*6+4,"esp"),0); 1390 &mov (&DWP(16*6+8,"esp"),1); 1391 &mov (&DWP(16*6+12,"esp"),0); 1392 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1393 
&mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1394 1395 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1396 &mov ($key_,$key); # backup $key 1397 &mov ($rounds_,$rounds); # backup $rounds 1398 1399 &movdqa ($tweak,$inout0); 1400 &pxor ($twtmp,$twtmp); 1401 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1402 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1403 1404 &and ($len,-16); 1405 &sub ($len,16*6); 1406 &jc (&label("xts_dec_short")); 1407 1408 &shr ($rounds,1); 1409 &mov ($rounds_,$rounds); 1410 &jmp (&label("xts_dec_loop6")); 1411 1412 &set_label("xts_dec_loop6",16); 1413 for ($i=0;$i<4;$i++) { 1414 &pshufd ($twres,$twtmp,0x13); 1415 &pxor ($twtmp,$twtmp); 1416 &movdqa (&QWP(16*$i,"esp"),$tweak); 1417 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1418 &pand ($twres,$twmask); # isolate carry and residue 1419 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1420 &pxor ($tweak,$twres); 1421 } 1422 &pshufd ($inout5,$twtmp,0x13); 1423 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1424 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1425 &$movekey ($rndkey0,&QWP(0,$key_)); 1426 &pand ($inout5,$twmask); # isolate carry and residue 1427 &movups ($inout0,&QWP(0,$inp)); # load input 1428 &pxor ($inout5,$tweak); 1429 1430 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1431 &movdqu ($inout1,&QWP(16*1,$inp)); 1432 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1433 &movdqu ($inout2,&QWP(16*2,$inp)); 1434 &pxor ($inout1,$rndkey0); 1435 &movdqu ($inout3,&QWP(16*3,$inp)); 1436 &pxor ($inout2,$rndkey0); 1437 &movdqu ($inout4,&QWP(16*4,$inp)); 1438 &pxor ($inout3,$rndkey0); 1439 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1440 &pxor ($inout4,$rndkey0); 1441 &lea ($inp,&DWP(16*6,$inp)); 1442 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1443 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1444 &pxor ($inout5,$rndkey1); 1445 1446 &$movekey ($rndkey1,&QWP(16,$key_)); 1447 &lea ($key,&DWP(32,$key_)); 1448 &pxor ($inout1,&QWP(16*1,"esp")); 1449 &aesdec 
($inout0,$rndkey1); 1450 &pxor ($inout2,&QWP(16*2,"esp")); 1451 &aesdec ($inout1,$rndkey1); 1452 &pxor ($inout3,&QWP(16*3,"esp")); 1453 &dec ($rounds); 1454 &aesdec ($inout2,$rndkey1); 1455 &pxor ($inout4,&QWP(16*4,"esp")); 1456 &aesdec ($inout3,$rndkey1); 1457 &pxor ($inout5,$rndkey0); 1458 &aesdec ($inout4,$rndkey1); 1459 &$movekey ($rndkey0,&QWP(0,$key)); 1460 &aesdec ($inout5,$rndkey1); 1461 &call (&label("_aesni_decrypt6_enter")); 1462 1463 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1464 &pxor ($twtmp,$twtmp); 1465 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1466 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1467 &xorps ($inout1,&QWP(16*1,"esp")); 1468 &movups (&QWP(16*0,$out),$inout0); # write output 1469 &xorps ($inout2,&QWP(16*2,"esp")); 1470 &movups (&QWP(16*1,$out),$inout1); 1471 &xorps ($inout3,&QWP(16*3,"esp")); 1472 &movups (&QWP(16*2,$out),$inout2); 1473 &xorps ($inout4,&QWP(16*4,"esp")); 1474 &movups (&QWP(16*3,$out),$inout3); 1475 &xorps ($inout5,$tweak); 1476 &movups (&QWP(16*4,$out),$inout4); 1477 &pshufd ($twres,$twtmp,0x13); 1478 &movups (&QWP(16*5,$out),$inout5); 1479 &lea ($out,&DWP(16*6,$out)); 1480 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1481 1482 &pxor ($twtmp,$twtmp); 1483 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1484 &pand ($twres,$twmask); # isolate carry and residue 1485 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1486 &mov ($rounds,$rounds_); # restore $rounds 1487 &pxor ($tweak,$twres); 1488 1489 &sub ($len,16*6); 1490 &jnc (&label("xts_dec_loop6")); 1491 1492 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1493 &mov ($key,$key_); # restore $key 1494 &mov ($rounds_,$rounds); 1495 1496 &set_label("xts_dec_short"); 1497 &add ($len,16*6); 1498 &jz (&label("xts_dec_done6x")); 1499 1500 &movdqa ($inout3,$tweak); # put aside previous tweak 1501 &cmp ($len,0x20); 1502 &jb (&label("xts_dec_one")); 1503 1504 &pshufd ($twres,$twtmp,0x13); 1505 &pxor ($twtmp,$twtmp); 1506 &paddq ($tweak,$tweak); # 
&psllq($tweak,1); 1507 &pand ($twres,$twmask); # isolate carry and residue 1508 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1509 &pxor ($tweak,$twres); 1510 &je (&label("xts_dec_two")); 1511 1512 &pshufd ($twres,$twtmp,0x13); 1513 &pxor ($twtmp,$twtmp); 1514 &movdqa ($inout4,$tweak); # put aside previous tweak 1515 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1516 &pand ($twres,$twmask); # isolate carry and residue 1517 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1518 &pxor ($tweak,$twres); 1519 &cmp ($len,0x40); 1520 &jb (&label("xts_dec_three")); 1521 1522 &pshufd ($twres,$twtmp,0x13); 1523 &pxor ($twtmp,$twtmp); 1524 &movdqa ($inout5,$tweak); # put aside previous tweak 1525 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1526 &pand ($twres,$twmask); # isolate carry and residue 1527 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1528 &pxor ($tweak,$twres); 1529 &movdqa (&QWP(16*0,"esp"),$inout3); 1530 &movdqa (&QWP(16*1,"esp"),$inout4); 1531 &je (&label("xts_dec_four")); 1532 1533 &movdqa (&QWP(16*2,"esp"),$inout5); 1534 &pshufd ($inout5,$twtmp,0x13); 1535 &movdqa (&QWP(16*3,"esp"),$tweak); 1536 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1537 &pand ($inout5,$twmask); # isolate carry and residue 1538 &pxor ($inout5,$tweak); 1539 1540 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1541 &movdqu ($inout1,&QWP(16*1,$inp)); 1542 &movdqu ($inout2,&QWP(16*2,$inp)); 1543 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1544 &movdqu ($inout3,&QWP(16*3,$inp)); 1545 &pxor ($inout1,&QWP(16*1,"esp")); 1546 &movdqu ($inout4,&QWP(16*4,$inp)); 1547 &pxor ($inout2,&QWP(16*2,"esp")); 1548 &lea ($inp,&DWP(16*5,$inp)); 1549 &pxor ($inout3,&QWP(16*3,"esp")); 1550 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1551 &pxor ($inout4,$inout5); 1552 1553 &call ("_aesni_decrypt6"); 1554 1555 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1556 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1557 &xorps ($inout1,&QWP(16*1,"esp")); 1558 &xorps ($inout2,&QWP(16*2,"esp")); 1559 
&movups (&QWP(16*0,$out),$inout0); # write output 1560 &xorps ($inout3,&QWP(16*3,"esp")); 1561 &movups (&QWP(16*1,$out),$inout1); 1562 &xorps ($inout4,$tweak); 1563 &movups (&QWP(16*2,$out),$inout2); 1564 &movups (&QWP(16*3,$out),$inout3); 1565 &movups (&QWP(16*4,$out),$inout4); 1566 &lea ($out,&DWP(16*5,$out)); 1567 &jmp (&label("xts_dec_done")); 1568 1569 &set_label("xts_dec_one",16); 1570 &movups ($inout0,&QWP(16*0,$inp)); # load input 1571 &lea ($inp,&DWP(16*1,$inp)); 1572 &xorps ($inout0,$inout3); # input^=tweak 1573 if ($inline) 1574 { &aesni_inline_generate1("dec"); } 1575 else 1576 { &call ("_aesni_decrypt1"); } 1577 &xorps ($inout0,$inout3); # output^=tweak 1578 &movups (&QWP(16*0,$out),$inout0); # write output 1579 &lea ($out,&DWP(16*1,$out)); 1580 1581 &movdqa ($tweak,$inout3); # last tweak 1582 &jmp (&label("xts_dec_done")); 1583 1584 &set_label("xts_dec_two",16); 1585 &movaps ($inout4,$tweak); # put aside last tweak 1586 1587 &movups ($inout0,&QWP(16*0,$inp)); # load input 1588 &movups ($inout1,&QWP(16*1,$inp)); 1589 &lea ($inp,&DWP(16*2,$inp)); 1590 &xorps ($inout0,$inout3); # input^=tweak 1591 &xorps ($inout1,$inout4); 1592 1593 &call ("_aesni_decrypt3"); 1594 1595 &xorps ($inout0,$inout3); # output^=tweak 1596 &xorps ($inout1,$inout4); 1597 &movups (&QWP(16*0,$out),$inout0); # write output 1598 &movups (&QWP(16*1,$out),$inout1); 1599 &lea ($out,&DWP(16*2,$out)); 1600 1601 &movdqa ($tweak,$inout4); # last tweak 1602 &jmp (&label("xts_dec_done")); 1603 1604 &set_label("xts_dec_three",16); 1605 &movaps ($inout5,$tweak); # put aside last tweak 1606 &movups ($inout0,&QWP(16*0,$inp)); # load input 1607 &movups ($inout1,&QWP(16*1,$inp)); 1608 &movups ($inout2,&QWP(16*2,$inp)); 1609 &lea ($inp,&DWP(16*3,$inp)); 1610 &xorps ($inout0,$inout3); # input^=tweak 1611 &xorps ($inout1,$inout4); 1612 &xorps ($inout2,$inout5); 1613 1614 &call ("_aesni_decrypt3"); 1615 1616 &xorps ($inout0,$inout3); # output^=tweak 1617 &xorps ($inout1,$inout4); 1618 &xorps 
($inout2,$inout5); 1619 &movups (&QWP(16*0,$out),$inout0); # write output 1620 &movups (&QWP(16*1,$out),$inout1); 1621 &movups (&QWP(16*2,$out),$inout2); 1622 &lea ($out,&DWP(16*3,$out)); 1623 1624 &movdqa ($tweak,$inout5); # last tweak 1625 &jmp (&label("xts_dec_done")); 1626 1627 &set_label("xts_dec_four",16); 1628 &movaps ($inout4,$tweak); # put aside last tweak 1629 1630 &movups ($inout0,&QWP(16*0,$inp)); # load input 1631 &movups ($inout1,&QWP(16*1,$inp)); 1632 &movups ($inout2,&QWP(16*2,$inp)); 1633 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1634 &movups ($inout3,&QWP(16*3,$inp)); 1635 &lea ($inp,&DWP(16*4,$inp)); 1636 &xorps ($inout1,&QWP(16*1,"esp")); 1637 &xorps ($inout2,$inout5); 1638 &xorps ($inout3,$inout4); 1639 1640 &call ("_aesni_decrypt4"); 1641 1642 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1643 &xorps ($inout1,&QWP(16*1,"esp")); 1644 &xorps ($inout2,$inout5); 1645 &movups (&QWP(16*0,$out),$inout0); # write output 1646 &xorps ($inout3,$inout4); 1647 &movups (&QWP(16*1,$out),$inout1); 1648 &movups (&QWP(16*2,$out),$inout2); 1649 &movups (&QWP(16*3,$out),$inout3); 1650 &lea ($out,&DWP(16*4,$out)); 1651 1652 &movdqa ($tweak,$inout4); # last tweak 1653 &jmp (&label("xts_dec_done")); 1654 1655 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1656 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1657 &and ($len,15); 1658 &jz (&label("xts_dec_ret")); 1659 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1660 &jmp (&label("xts_dec_only_one_more")); 1661 1662 &set_label("xts_dec_done",16); 1663 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1664 &pxor ($twtmp,$twtmp); 1665 &and ($len,15); 1666 &jz (&label("xts_dec_ret")); 1667 1668 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1669 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1670 &pshufd ($twres,$twtmp,0x13); 1671 &pxor ($twtmp,$twtmp); 1672 &movdqa ($twmask,&QWP(16*6,"esp")); 1673 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1674 &pand ($twres,$twmask); # 
isolate carry and residue 1675 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1676 &pxor ($tweak,$twres); 1677 1678 &set_label("xts_dec_only_one_more"); 1679 &pshufd ($inout3,$twtmp,0x13); 1680 &movdqa ($inout4,$tweak); # put aside previous tweak 1681 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1682 &pand ($inout3,$twmask); # isolate carry and residue 1683 &pxor ($inout3,$tweak); 1684 1685 &mov ($key,$key_); # restore $key 1686 &mov ($rounds,$rounds_); # restore $rounds 1687 1688 &movups ($inout0,&QWP(0,$inp)); # load input 1689 &xorps ($inout0,$inout3); # input^=tweak 1690 if ($inline) 1691 { &aesni_inline_generate1("dec"); } 1692 else 1693 { &call ("_aesni_decrypt1"); } 1694 &xorps ($inout0,$inout3); # output^=tweak 1695 &movups (&QWP(0,$out),$inout0); # write output 1696 1697 &set_label("xts_dec_steal"); 1698 &movz ($rounds,&BP(16,$inp)); 1699 &movz ($key,&BP(0,$out)); 1700 &lea ($inp,&DWP(1,$inp)); 1701 &mov (&BP(0,$out),&LB($rounds)); 1702 &mov (&BP(16,$out),&LB($key)); 1703 &lea ($out,&DWP(1,$out)); 1704 &sub ($len,1); 1705 &jnz (&label("xts_dec_steal")); 1706 1707 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1708 &mov ($key,$key_); # restore $key 1709 &mov ($rounds,$rounds_); # restore $rounds 1710 1711 &movups ($inout0,&QWP(0,$out)); # load input 1712 &xorps ($inout0,$inout4); # input^=tweak 1713 if ($inline) 1714 { &aesni_inline_generate1("dec"); } 1715 else 1716 { &call ("_aesni_decrypt1"); } 1717 &xorps ($inout0,$inout4); # output^=tweak 1718 &movups (&QWP(0,$out),$inout0); # write output 1719 1720 &set_label("xts_dec_ret"); 1721 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1722 &function_end("aesni_xts_decrypt"); 1723 } 1724 } 1725 1727 ###################################################################### 1728 # void $PREFIX_cbc_encrypt (const void *inp, void *out, 1729 # size_t length, const AES_KEY *key, 1730 # unsigned char *ivp,const int enc); 1731 &function_begin("${PREFIX}_cbc_encrypt"); 1732 &mov ($inp,&wparam(0)); 1733 &mov 
($rounds_,"esp"); 1734 &mov ($out,&wparam(1)); 1735 &sub ($rounds_,24); 1736 &mov ($len,&wparam(2)); 1737 &and ($rounds_,-16); 1738 &mov ($key,&wparam(3)); 1739 &mov ($key_,&wparam(4)); 1740 &test ($len,$len); 1741 &jz (&label("cbc_abort")); 1742 1743 &cmp (&wparam(5),0); 1744 &xchg ($rounds_,"esp"); # alloca 1745 &movups ($ivec,&QWP(0,$key_)); # load IV 1746 &mov ($rounds,&DWP(240,$key)); 1747 &mov ($key_,$key); # backup $key 1748 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1749 &mov ($rounds_,$rounds); # backup $rounds 1750 &je (&label("cbc_decrypt")); 1751 1752 &movaps ($inout0,$ivec); 1753 &cmp ($len,16); 1754 &jb (&label("cbc_enc_tail")); 1755 &sub ($len,16); 1756 &jmp (&label("cbc_enc_loop")); 1757 1758 &set_label("cbc_enc_loop",16); 1759 &movups ($ivec,&QWP(0,$inp)); # input actually 1760 &lea ($inp,&DWP(16,$inp)); 1761 if ($inline) 1762 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1763 else 1764 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1765 &mov ($rounds,$rounds_); # restore $rounds 1766 &mov ($key,$key_); # restore $key 1767 &movups (&QWP(0,$out),$inout0); # store output 1768 &lea ($out,&DWP(16,$out)); 1769 &sub ($len,16); 1770 &jnc (&label("cbc_enc_loop")); 1771 &add ($len,16); 1772 &jnz (&label("cbc_enc_tail")); 1773 &movaps ($ivec,$inout0); 1774 &jmp (&label("cbc_ret")); 1775 1776 &set_label("cbc_enc_tail"); 1777 &mov ("ecx",$len); # zaps $rounds 1778 &data_word(0xA4F3F689); # rep movsb 1779 &mov ("ecx",16); # zero tail 1780 &sub ("ecx",$len); 1781 &xor ("eax","eax"); # zaps $len 1782 &data_word(0xAAF3F689); # rep stosb 1783 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1784 &mov ($rounds,$rounds_); # restore $rounds 1785 &mov ($inp,$out); # $inp and $out are the same 1786 &mov ($key,$key_); # restore $key 1787 &jmp (&label("cbc_enc_loop")); 1788 ###################################################################### 1789 &set_label("cbc_decrypt",16); 1790 &cmp ($len,0x50); 1791 &jbe (&label("cbc_dec_tail")); 1792 &movaps 
(&QWP(0,"esp"),$ivec); # save IV 1793 &sub ($len,0x50); 1794 &jmp (&label("cbc_dec_loop6_enter")); 1795 1796 &set_label("cbc_dec_loop6",16); 1797 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1798 &movups (&QWP(0,$out),$inout5); 1799 &lea ($out,&DWP(0x10,$out)); 1800 &set_label("cbc_dec_loop6_enter"); 1801 &movdqu ($inout0,&QWP(0,$inp)); 1802 &movdqu ($inout1,&QWP(0x10,$inp)); 1803 &movdqu ($inout2,&QWP(0x20,$inp)); 1804 &movdqu ($inout3,&QWP(0x30,$inp)); 1805 &movdqu ($inout4,&QWP(0x40,$inp)); 1806 &movdqu ($inout5,&QWP(0x50,$inp)); 1807 1808 &call ("_aesni_decrypt6"); 1809 1810 &movups ($rndkey1,&QWP(0,$inp)); 1811 &movups ($rndkey0,&QWP(0x10,$inp)); 1812 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1813 &xorps ($inout1,$rndkey1); 1814 &movups ($rndkey1,&QWP(0x20,$inp)); 1815 &xorps ($inout2,$rndkey0); 1816 &movups ($rndkey0,&QWP(0x30,$inp)); 1817 &xorps ($inout3,$rndkey1); 1818 &movups ($rndkey1,&QWP(0x40,$inp)); 1819 &xorps ($inout4,$rndkey0); 1820 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1821 &xorps ($inout5,$rndkey1); 1822 &movups (&QWP(0,$out),$inout0); 1823 &movups (&QWP(0x10,$out),$inout1); 1824 &lea ($inp,&DWP(0x60,$inp)); 1825 &movups (&QWP(0x20,$out),$inout2); 1826 &mov ($rounds,$rounds_); # restore $rounds 1827 &movups (&QWP(0x30,$out),$inout3); 1828 &mov ($key,$key_); # restore $key 1829 &movups (&QWP(0x40,$out),$inout4); 1830 &lea ($out,&DWP(0x50,$out)); 1831 &sub ($len,0x60); 1832 &ja (&label("cbc_dec_loop6")); 1833 1834 &movaps ($inout0,$inout5); 1835 &movaps ($ivec,$rndkey0); 1836 &add ($len,0x50); 1837 &jle (&label("cbc_dec_tail_collected")); 1838 &movups (&QWP(0,$out),$inout0); 1839 &lea ($out,&DWP(0x10,$out)); 1840 &set_label("cbc_dec_tail"); 1841 &movups ($inout0,&QWP(0,$inp)); 1842 &movaps ($in0,$inout0); 1843 &cmp ($len,0x10); 1844 &jbe (&label("cbc_dec_one")); 1845 1846 &movups ($inout1,&QWP(0x10,$inp)); 1847 &movaps ($in1,$inout1); 1848 &cmp ($len,0x20); 1849 &jbe (&label("cbc_dec_two")); 1850 1851 &movups ($inout2,&QWP(0x20,$inp)); 1852 &cmp 
($len,0x30); 1853 &jbe (&label("cbc_dec_three")); 1854 1855 &movups ($inout3,&QWP(0x30,$inp)); 1856 &cmp ($len,0x40); 1857 &jbe (&label("cbc_dec_four")); 1858 1859 &movups ($inout4,&QWP(0x40,$inp)); 1860 &movaps (&QWP(0,"esp"),$ivec); # save IV 1861 &movups ($inout0,&QWP(0,$inp)); 1862 &xorps ($inout5,$inout5); 1863 &call ("_aesni_decrypt6"); 1864 &movups ($rndkey1,&QWP(0,$inp)); 1865 &movups ($rndkey0,&QWP(0x10,$inp)); 1866 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 1867 &xorps ($inout1,$rndkey1); 1868 &movups ($rndkey1,&QWP(0x20,$inp)); 1869 &xorps ($inout2,$rndkey0); 1870 &movups ($rndkey0,&QWP(0x30,$inp)); 1871 &xorps ($inout3,$rndkey1); 1872 &movups ($ivec,&QWP(0x40,$inp)); # IV 1873 &xorps ($inout4,$rndkey0); 1874 &movups (&QWP(0,$out),$inout0); 1875 &movups (&QWP(0x10,$out),$inout1); 1876 &movups (&QWP(0x20,$out),$inout2); 1877 &movups (&QWP(0x30,$out),$inout3); 1878 &lea ($out,&DWP(0x40,$out)); 1879 &movaps ($inout0,$inout4); 1880 &sub ($len,0x50); 1881 &jmp (&label("cbc_dec_tail_collected")); 1882 1883 &set_label("cbc_dec_one",16); 1884 if ($inline) 1885 { &aesni_inline_generate1("dec"); } 1886 else 1887 { &call ("_aesni_decrypt1"); } 1888 &xorps ($inout0,$ivec); 1889 &movaps ($ivec,$in0); 1890 &sub ($len,0x10); 1891 &jmp (&label("cbc_dec_tail_collected")); 1892 1893 &set_label("cbc_dec_two",16); 1894 &xorps ($inout2,$inout2); 1895 &call ("_aesni_decrypt3"); 1896 &xorps ($inout0,$ivec); 1897 &xorps ($inout1,$in0); 1898 &movups (&QWP(0,$out),$inout0); 1899 &movaps ($inout0,$inout1); 1900 &lea ($out,&DWP(0x10,$out)); 1901 &movaps ($ivec,$in1); 1902 &sub ($len,0x20); 1903 &jmp (&label("cbc_dec_tail_collected")); 1904 1905 &set_label("cbc_dec_three",16); 1906 &call ("_aesni_decrypt3"); 1907 &xorps ($inout0,$ivec); 1908 &xorps ($inout1,$in0); 1909 &xorps ($inout2,$in1); 1910 &movups (&QWP(0,$out),$inout0); 1911 &movaps ($inout0,$inout2); 1912 &movups (&QWP(0x10,$out),$inout1); 1913 &lea ($out,&DWP(0x20,$out)); 1914 &movups ($ivec,&QWP(0x20,$inp)); 1915 &sub 
($len,0x30); 1916 &jmp (&label("cbc_dec_tail_collected")); 1917 1918 &set_label("cbc_dec_four",16); 1919 &call ("_aesni_decrypt4"); 1920 &movups ($rndkey1,&QWP(0x10,$inp)); 1921 &movups ($rndkey0,&QWP(0x20,$inp)); 1922 &xorps ($inout0,$ivec); 1923 &movups ($ivec,&QWP(0x30,$inp)); 1924 &xorps ($inout1,$in0); 1925 &movups (&QWP(0,$out),$inout0); 1926 &xorps ($inout2,$rndkey1); 1927 &movups (&QWP(0x10,$out),$inout1); 1928 &xorps ($inout3,$rndkey0); 1929 &movups (&QWP(0x20,$out),$inout2); 1930 &lea ($out,&DWP(0x30,$out)); 1931 &movaps ($inout0,$inout3); 1932 &sub ($len,0x40); 1933 1934 &set_label("cbc_dec_tail_collected"); 1935 &and ($len,15); 1936 &jnz (&label("cbc_dec_tail_partial")); 1937 &movups (&QWP(0,$out),$inout0); 1938 &jmp (&label("cbc_ret")); 1939 1940 &set_label("cbc_dec_tail_partial",16); 1941 &movaps (&QWP(0,"esp"),$inout0); 1942 &mov ("ecx",16); 1943 &mov ($inp,"esp"); 1944 &sub ("ecx",$len); 1945 &data_word(0xA4F3F689); # rep movsb 1946 1947 &set_label("cbc_ret"); 1948 &mov ("esp",&DWP(16,"esp")); # pull original %esp 1949 &mov ($key_,&wparam(4)); 1950 &movups (&QWP(0,$key_),$ivec); # output IV 1951 &set_label("cbc_abort"); 1952 &function_end("${PREFIX}_cbc_encrypt"); 1953 1955 ###################################################################### 1956 # Mechanical port from aesni-x86_64.pl. 
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if either pointer is NULL,
#		-2 if bits is not 128, 192 or 256
#	$rounds	rounds-1 (9, 11 or 13); the same value is also written
#		to the key structure right behind the last round key
#		(byte offset 240 from the start of *key in all three cases)
#
# The key_* labels below are local subroutines reached with call/ret.
# Each "warm" entry first stores the round key finished by the previous
# call and advances $key, then falls through to the "cold" entry, which
# only derives the next round key from the aeskeygenassist result
# left in xmm1.

&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");		# store last round key
	&mov		(&DWP(80,$key),$rounds);	# store rounds-1
	&xor		("eax","eax");			# return success
	&ret();

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");		# store previous round key
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");		# store last round key
	&mov		(&DWP(48,$key),$rounds);	# store rounds-1
	&xor		("eax","eax");			# return success
	&ret();

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");		# store previous round key
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");		# keep a copy for key_192b
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	# Assemble two 16-byte schedule entries from xmm5, xmm0 and xmm2,
	# store both, then rejoin the common derivation at key_192b_warm.
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov		($rounds,13);
	&lea		($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");		# store last round key
	&mov		(&DWP(16,$key),$rounds);	# store rounds-1
	&xor		("eax","eax");			# return success
	&ret();

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");		# store previous round key
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");		# store previous round key
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&ret	();
&set_label("bad_keybits",4);
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# expected by _aesni_set_encrypt_key and returns its result in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key, then
# converts it in place: the order of the round keys is reversed and
# aesimc is applied to every round key except the first and the last.
# Returns 0 on success, otherwise the error code produced by
# _aesni_set_encrypt_key (the schedule is left untouched in that case).
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# $key was advanced by the callee
	&shl	($rounds,4);		# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));	# until the pointers meet

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();