#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91
# Haswell	4.44/0.80	0.97	1.03	0.72
# Skylake	2.68/0.65	0.65	0.66	0.64
# Silvermont	5.77/3.56	3.67	4.03	3.46
# Goldmont	3.84/1.39	1.39	1.63	1.31
# Bulldozer	5.80/0.98	1.05	1.24	0.93

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl reachable relative to the directory this script
# lives in, then pull it in.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything
# printed to STDOUT from here on goes there. Fail loudly instead of
# silently discarding the generated code when the file is missing or
# cannot be created (a 3-arg open on undef would otherwise succeed on
# an anonymous temporary file).
$output = pop;
die "$0: no output file specified\n" if (!defined($output));
open OUT,">",$output or die "$0: can't open $output: $!\n";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys. Both $PREFIX flavours used the
# very same instruction, so there is nothing to select.
$movekey=\&movups;

$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
#
# The helpers below emit the instructions as raw bytes through
# &data_byte. Each one emits something only when both operands are
# xmm0..xmm7 registers; any other operand combination is silently
# ignored.

# aeskeygenassist(dst,src,imm): bytes 66 0f 3a df, ModR/M, imm.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
}
# aescommon(opcodelet,dst,src): bytes 66 0f 38 <opcodelet>, ModR/M.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1(p[,inout[,ivec]])
#   p      "enc" or "dec", selects aes${p}/aes${p}last
#   inout  register holding the block, defaults to $inout0
#   ivec   optional register; when given, it is xor-ed with round
#          key 0 and the result is xor-ed into inout, instead of
#          xor-ing round key 0 into inout directly
# Emits a loop that counts $rounds down to zero and advances $key, so
# both registers are clobbered. $sn makes the loop label unique for
# every expansion.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    eval"&aes${p}last	($inout,$rndkey1)";
}}

# aesni_generate1(p[,inout]) emits the out-of-line _aesni_${p}rypt1
# subroutine. $rounds is compared with 11 to choose the entry point:
# below 11 jumps to the "${p}128" label, equal to 11 to "${p}192",
# anything else falls through the two extra pairs of rounds first.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x10,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x30,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x50,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x60,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x70,$key));
	eval"&aes${p}	($inout,$rndkey1)";
    eval"&aes${p}last	($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Loads one block from inp, reads the round count from offset 240 of
# key, encrypts and stores the block to out. The xmm registers that
# were used are zeroed before returning.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Mirror image of $PREFIX_encrypt using the "dec" instruction flavour.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e.
# when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...

# Register usage shared by the _aesni_${p}ryptN generators below:
# $rounds is shifted left by 4 and negated so that it serves as a
# negative byte offset from $key, which is advanced past the key
# schedule; the loop steps the offset by 32 (two round keys) until
# it reaches zero. Both $key and $rounds are therefore clobbered and
# callers that need them afterwards keep copies in $key_/$rounds_.

# aesni_generate2(p): emit _aesni_${p}rypt2 working on $inout0..1.
sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}2_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

# aesni_generate3(p): emit _aesni_${p}rypt3 working on $inout0..2.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}3_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4(p): emit _aesni_${p}rypt4 working on $inout0..3.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# four raw filler bytes emitted ahead of the loop label
	# (presumably a multi-byte nop used as padding - TODO confirm)
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# aesni_generate6(p): emit _aesni_${p}rypt6 working on $inout0..5.
# The first round is interleaved with the initial key xor, which is
# why the prologue jumps into the middle of the loop body. The
# "_aesni_${p}rypt6_enter" label is declared static so that other code
# in this file can call straight into the loop after performing the
# prologue itself.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
    &set_label("_aesni_${p}rypt6_inner");
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}	($inout4,$rndkey1)";
    eval"&aes${p}	($inout5,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    eval"&aes${p}last	($inout4,$rndkey0)";
    eval"&aes${p}last	($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The decrypt flavours are always emitted, the encrypt ones only for
# the "aesni" prefix.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is processed if
# the result is zero. A zero enc selects the decrypt path. Input is
# consumed 6 blocks at a time, the remaining 1..5 blocks are handled
# by the tail code. All xmm registers are zeroed on exit.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 5 blocks left, 6th lane is a dummy
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 5 blocks left, 6th lane is a dummy
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# stack layout (after alignment):
#	0	pshufb byte-swap mask
#	16	counter increment vector
#	48	saved %esp
#
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	# $rounds_ becomes the negative byte offset into the key
	# schedule that the inner loop counts up to zero, $key_ keeps
	# the start of the schedule
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	# encrypt the first counter block on its own, the outer loop
	# below then pairs every further counter block with the cmac
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	# NOTE(review): the inline flavour folds the last output block
	# ($in0) into $cmac and encrypts $cmac. The out-of-line
	# _aesni_encrypt1 is generated for $inout0 only and takes no
	# $in0, so the !$inline branch does not look equivalent -
	# confirm before ever building with $inline=0.
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);

	# a single block skips all of the counter vector setup
	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&call		(&label("_aesni_encrypt6_enter"));

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

# aesni_xts_encrypt: emit the XTS encryption routine.
#
# Arguments are fetched with &wparam(): 0 -> $inp, 1 -> $out, 2 -> $len,
# 3 -> key1 (data key), 4 -> key2 (tweak key), 5 -> clear-text tweak.
#
# Stack frame (16*7+8 bytes reserved, then %esp aligned down to 16):
#   16*0 .. 16*5   per-block tweak values
#   16*6           128-bit constant composed from dwords {0x87,0,1,0}
#   16*7+0         saved $len (later overwritten with $len%16)
#   16*7+4         caller's %esp
#
# $tweak/$twtmp/$twres/$twmask are lexicals declared just before this
# function; they alias $rndkey1/$rndkey0/$inout0/$inout1 respectively,
# which is why $twmask has to be reloaded after $inout1 carried data.
&function_begin("aesni_xts_encrypt");
    &mov ($key,&wparam(4)); # key2
    &mov ($inp,&wparam(5)); # clear-text tweak

    # Run the single-block encrypt helper on the tweak using key2.
    &mov ($rounds,&DWP(240,$key)); # key2->rounds
    &movups ($inout0,&QWP(0,$inp));
    if ($inline)
    { &aesni_inline_generate1("enc"); }
    else
    { &call ("_aesni_encrypt1"); }

    &mov ($inp,&wparam(0));
    &mov ($out,&wparam(1));
    &mov ($len,&wparam(2));
    &mov ($key,&wparam(3)); # key1

    &mov ($key_,"esp"); # $key_ temporarily carries the caller's %esp
    &sub ("esp",16*7+8);
    &mov ($rounds,&DWP(240,$key)); # key1->rounds
    &and ("esp",-16); # align stack

    &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
    &mov (&DWP(16*6+4,"esp"),0);
    &mov (&DWP(16*6+8,"esp"),1);
    &mov (&DWP(16*6+12,"esp"),0);
    &mov (&DWP(16*7+0,"esp"),$len); # save original $len
    &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp

    &movdqa ($tweak,$inout0); # result of the helper above becomes the first tweak
    &pxor ($twtmp,$twtmp);
    &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits

    &and ($len,-16); # whole blocks only; remainder is handled at xts_enc_done*
    &mov ($key_,$key); # backup $key
    &mov ($rounds_,$rounds); # backup $rounds
    &sub ($len,16*6);
    &jc (&label("xts_enc_short")); # fewer than six blocks

    # For the 6x loop: $key = key1+32+16*rounds and $rounds_ = 16-16*rounds;
    # the loop copies $rounds_ into $rounds before entering
    # _aesni_encrypt6_enter.
    &shl ($rounds,4);
    &mov ($rounds_,16);
    &sub ($rounds_,$rounds);
    &lea ($key,&DWP(32,$key,$rounds));
    &jmp (&label("xts_enc_loop6"));

&set_label("xts_enc_loop6",16);
    # Store tweaks 0..3 at 16*$i(%esp), advancing $tweak after each store:
    # paddq doubles both 64-bit halves, then the carry/residue selected by
    # $twmask is xored back in.
    for ($i=0;$i<4;$i++) {
        &pshufd ($twres,$twtmp,0x13);
        &pxor ($twtmp,$twtmp);
        &movdqa (&QWP(16*$i,"esp"),$tweak);
        &paddq ($tweak,$tweak); # &psllq($tweak,1);
        &pand ($twres,$twmask); # isolate carry and residue
        &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
        &pxor ($tweak,$twres);
    }
    # Tweak 4 goes to slot 4 ($i becomes 5 afterwards); tweak 5 is built in $inout5.
    &pshufd ($inout5,$twtmp,0x13);
    &movdqa (&QWP(16*$i++,"esp"),$tweak);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &$movekey ($rndkey0,&QWP(0,$key_));
    &pand ($inout5,$twmask); # isolate carry and residue
    &movups ($inout0,&QWP(0,$inp)); # load input
    &pxor ($inout5,$tweak);

    # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
    &mov ($rounds,$rounds_); # restore $rounds
    &movdqu ($inout1,&QWP(16*1,$inp)); # overwrites $twmask (same register)
    &xorps ($inout0,$rndkey0); # input^=rndkey[0]
    &movdqu ($inout2,&QWP(16*2,$inp));
    &pxor ($inout1,$rndkey0);
    &movdqu ($inout3,&QWP(16*3,$inp));
    &pxor ($inout2,$rndkey0);
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout3,$rndkey0);
    &movdqu ($rndkey1,&QWP(16*5,$inp)); # sixth input block, parked in $rndkey1
    &pxor ($inout4,$rndkey0);
    &lea ($inp,&DWP(16*6,$inp));
    &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
    &pxor ($inout5,$rndkey1);

    # First AES round for all six blocks, interleaved with the remaining
    # tweak xors; the rest of the rounds run in _aesni_encrypt6_enter.
    &$movekey ($rndkey1,&QWP(16,$key_));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &aesenc ($inout0,$rndkey1);
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &aesenc ($inout1,$rndkey1);
    &pxor ($inout5,$rndkey0);
    &$movekey ($rndkey0,&QWP(32,$key_));
    &aesenc ($inout2,$rndkey1);
    &aesenc ($inout3,$rndkey1);
    &aesenc ($inout4,$rndkey1);
    &aesenc ($inout5,$rndkey1);
    &call (&label("_aesni_encrypt6_enter"));

    &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
    &pxor ($twtmp,$twtmp);
    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
    &xorps ($inout1,&QWP(16*1,"esp"));
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout2,&QWP(16*2,"esp"));
    &movups (&QWP(16*1,$out),$inout1);
    &xorps ($inout3,&QWP(16*3,"esp"));
    &movups (&QWP(16*2,$out),$inout2);
    &xorps ($inout4,&QWP(16*4,"esp"));
    &movups (&QWP(16*3,$out),$inout3);
    &xorps ($inout5,$tweak);
    &movups (&QWP(16*4,$out),$inout4);
    &pshufd ($twres,$twtmp,0x13);
    &movups (&QWP(16*5,$out),$inout5);
    &lea ($out,&DWP(16*6,$out));
    &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87

    # Advance $tweak once more so it is ready for the next iteration
    # (or for the short/tail paths below).
    &pxor ($twtmp,$twtmp);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);

    &sub ($len,16*6);
    &jnc (&label("xts_enc_loop6"));

    &mov ($rounds,&DWP(240,$key_)); # restore $rounds
    &mov ($key,$key_); # restore $key
    &mov ($rounds_,$rounds);

&set_label("xts_enc_short");
    # One to five whole blocks remain (zero -> xts_enc_done6x).
    &add ($len,16*6);
    &jz (&label("xts_enc_done6x"));

    &movdqa ($inout3,$tweak); # put aside previous tweak
    &cmp ($len,0x20);
    &jb (&label("xts_enc_one"));

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &je (&label("xts_enc_two")); # flags still from the cmp with 0x20

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &movdqa ($inout4,$tweak); # put aside previous tweak
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &cmp ($len,0x40);
    &jb (&label("xts_enc_three"));

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &movdqa ($inout5,$tweak); # put aside previous tweak
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &movdqa (&QWP(16*0,"esp"),$inout3);
    &movdqa (&QWP(16*1,"esp"),$inout4);
    &je (&label("xts_enc_four")); # flags still from the cmp with 0x40

    # Five blocks: tweaks 0..3 go to the stack, tweak 4 is built in $inout5.
    &movdqa (&QWP(16*2,"esp"),$inout5);
    &pshufd ($inout5,$twtmp,0x13);
    &movdqa (&QWP(16*3,"esp"),$tweak);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($inout5,$twmask); # isolate carry and residue
    &pxor ($inout5,$tweak);

    &movdqu ($inout0,&QWP(16*0,$inp)); # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movdqu ($inout3,&QWP(16*3,$inp));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &lea ($inp,&DWP(16*5,$inp));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
    &pxor ($inout4,$inout5);

    &call ("_aesni_encrypt6");

    &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,&QWP(16*2,"esp"));
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout3,&QWP(16*3,"esp"));
    &movups (&QWP(16*1,$out),$inout1);
    &xorps ($inout4,$tweak);
    &movups (&QWP(16*2,$out),$inout2);
    &movups (&QWP(16*3,$out),$inout3);
    &movups (&QWP(16*4,$out),$inout4);
    &lea ($out,&DWP(16*5,$out));
    &jmp (&label("xts_enc_done"));

&set_label("xts_enc_one",16);
    # One block; its tweak is in $inout3.
    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &lea ($inp,&DWP(16*1,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    if ($inline)
    { &aesni_inline_generate1("enc"); }
    else
    { &call ("_aesni_encrypt1"); }
    &xorps ($inout0,$inout3); # output^=tweak
    &movups (&QWP(16*0,$out),$inout0); # write output
    &lea ($out,&DWP(16*1,$out));

    &movdqa ($tweak,$inout3); # last tweak
    &jmp (&label("xts_enc_done"));

&set_label("xts_enc_two",16);
    # Two blocks; tweaks are $inout3 and $tweak (copied to $inout4).
    &movaps ($inout4,$tweak); # put aside last tweak

    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &lea ($inp,&DWP(16*2,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    &xorps ($inout1,$inout4);

    &call ("_aesni_encrypt2");

    &xorps ($inout0,$inout3); # output^=tweak
    &xorps ($inout1,$inout4);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &movups (&QWP(16*1,$out),$inout1);
    &lea ($out,&DWP(16*2,$out));

    &movdqa ($tweak,$inout4); # last tweak
    &jmp (&label("xts_enc_done"));

&set_label("xts_enc_three",16);
    # Three blocks; tweaks are $inout3, $inout4 and $tweak (copied to $inout5).
    &movaps ($inout5,$tweak); # put aside last tweak
    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &movups ($inout2,&QWP(16*2,$inp));
    &lea ($inp,&DWP(16*3,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);

    &call ("_aesni_encrypt3");

    &xorps ($inout0,$inout3); # output^=tweak
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &movups (&QWP(16*1,$out),$inout1);
    &movups (&QWP(16*2,$out),$inout2);
    &lea ($out,&DWP(16*3,$out));

    &movdqa ($tweak,$inout5); # last tweak
    &jmp (&label("xts_enc_done"));

&set_label("xts_enc_four",16);
    # Four blocks; tweaks 0 and 1 were stored at 16*0/16*1(%esp) before the
    # branch, tweak 2 is in $inout5 and tweak 3 is $tweak (copied to $inout4).
    &movaps ($inout4,$tweak); # put aside last tweak

    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &movups ($inout2,&QWP(16*2,$inp));
    &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movups ($inout3,&QWP(16*3,$inp));
    &lea ($inp,&DWP(16*4,$inp));
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout5);
    &xorps ($inout3,$inout4);

    &call ("_aesni_encrypt4");

    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout5);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout3,$inout4);
    &movups (&QWP(16*1,$out),$inout1);
    &movups (&QWP(16*2,$out),$inout2);
    &movups (&QWP(16*3,$out),$inout3);
    &lea ($out,&DWP(16*4,$out));

    &movdqa ($tweak,$inout4); # last tweak
    &jmp (&label("xts_enc_done"));

&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
    # Reached when the length was a multiple of six blocks: the loop has
    # already advanced $tweak, so it is used as-is for the stolen block.
    &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
    &and ($len,15);
    &jz (&label("xts_enc_ret"));
    &movdqa ($inout3,$tweak);
    &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
    &jmp (&label("xts_enc_steal"));

&set_label("xts_enc_done",16);
    # $tweak holds the tweak of the last block written; if there is a
    # partial tail, derive the following tweak into $inout3.
    &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
    &pxor ($twtmp,$twtmp);
    &and ($len,15);
    &jz (&label("xts_enc_ret"));

    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
    &pshufd ($inout3,$twtmp,0x13);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
    &pxor ($inout3,$tweak);

&set_label("xts_enc_steal");
    # Byte loop, $len%16 iterations: swap each remaining input byte with the
    # corresponding byte of the last output block ($rounds and $key serve as
    # byte temporaries and are restored afterwards).
    &movz ($rounds,&BP(0,$inp));
    &movz ($key,&BP(-16,$out));
    &lea ($inp,&DWP(1,$inp));
    &mov (&BP(-16,$out),&LB($rounds));
    &mov (&BP(0,$out),&LB($key));
    &lea ($out,&DWP(1,$out));
    &sub ($len,1);
    &jnz (&label("xts_enc_steal"));

    &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
    &mov ($key,$key_); # restore $key
    &mov ($rounds,$rounds_); # restore $rounds

    # Re-encrypt the patched last full block in place with tweak $inout3.
    &movups ($inout0,&QWP(-16,$out)); # load input
    &xorps ($inout0,$inout3); # input^=tweak
    if ($inline)
    { &aesni_inline_generate1("enc"); }
    else
    { &call ("_aesni_encrypt1"); }
    &xorps ($inout0,$inout3); # output^=tweak
    &movups (&QWP(-16,$out),$inout0); # write output

&set_label("xts_enc_ret");
    # Wipe xmm registers and the tweak slots; the register clears are
    # interleaved with stores of the already-zero xmm0.
    &pxor ("xmm0","xmm0"); # clear register bank
    &pxor ("xmm1","xmm1");
    &pxor ("xmm2","xmm2");
    &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
    &pxor ("xmm3","xmm3");
    &movdqa (&QWP(16*1,"esp"),"xmm0");
    &pxor ("xmm4","xmm4");
    &movdqa (&QWP(16*2,"esp"),"xmm0");
    &pxor ("xmm5","xmm5");
    &movdqa (&QWP(16*3,"esp"),"xmm0");
    &pxor ("xmm6","xmm6");
# Remainder of the xts_enc_ret epilogue of aesni_xts_encrypt: finish
# clearing the tweak slots (xmm0 is zero here) and restore the caller's %esp.
    &movdqa (&QWP(16*4,"esp"),"xmm0");
    &pxor ("xmm7","xmm7");
    &movdqa (&QWP(16*5,"esp"),"xmm0");
    &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");

# aesni_xts_decrypt: emit the XTS decryption routine.
#
# Same &wparam() layout and stack frame as aesni_xts_encrypt:
#   wparam 0..5 = $inp, $out, $len, key1, key2, clear-text tweak
#   16*0..16*5(%esp) tweaks, 16*6 constant {0x87,0,1,0},
#   16*7+0 saved $len, 16*7+4 caller's %esp.
# The tweak is still processed with the *encrypt* helper under key2; only
# the data blocks use the aesdec/_aesni_decrypt* paths.
# When $len is not a multiple of 16, one whole block is held back from the
# bulk paths and handled together with the partial tail at xts_dec_done*.
&function_begin("aesni_xts_decrypt");
    &mov ($key,&wparam(4)); # key2
    &mov ($inp,&wparam(5)); # clear-text tweak

    &mov ($rounds,&DWP(240,$key)); # key2->rounds
    &movups ($inout0,&QWP(0,$inp));
    if ($inline)
    { &aesni_inline_generate1("enc"); }
    else
    { &call ("_aesni_encrypt1"); }

    &mov ($inp,&wparam(0));
    &mov ($out,&wparam(1));
    &mov ($len,&wparam(2));
    &mov ($key,&wparam(3)); # key1

    &mov ($key_,"esp"); # $key_ temporarily carries the caller's %esp
    &sub ("esp",16*7+8);
    &and ("esp",-16); # align stack

    # $rounds_ = ($len & 15) ? 16 : 0, subtracted from $len.
    &xor ($rounds_,$rounds_); # if(len%16) len-=16;
    &test ($len,15);
    &setnz (&LB($rounds_));
    &shl ($rounds_,4);
    &sub ($len,$rounds_);

    &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
    &mov (&DWP(16*6+4,"esp"),0);
    &mov (&DWP(16*6+8,"esp"),1);
    &mov (&DWP(16*6+12,"esp"),0);
    &mov (&DWP(16*7+0,"esp"),$len); # save original $len
    &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp

    &mov ($rounds,&DWP(240,$key)); # key1->rounds
    &mov ($key_,$key); # backup $key
    &mov ($rounds_,$rounds); # backup $rounds

    &movdqa ($tweak,$inout0); # result of the helper above becomes the first tweak
    &pxor ($twtmp,$twtmp);
    &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits

    &and ($len,-16);
    &sub ($len,16*6);
    &jc (&label("xts_dec_short")); # fewer than six blocks

    # For the 6x loop: $key = key1+32+16*rounds and $rounds_ = 16-16*rounds.
    &shl ($rounds,4);
    &mov ($rounds_,16);
    &sub ($rounds_,$rounds);
    &lea ($key,&DWP(32,$key,$rounds));
    &jmp (&label("xts_dec_loop6"));

&set_label("xts_dec_loop6",16);
    # Store tweaks 0..3 at 16*$i(%esp), advancing $tweak after each store.
    for ($i=0;$i<4;$i++) {
        &pshufd ($twres,$twtmp,0x13);
        &pxor ($twtmp,$twtmp);
        &movdqa (&QWP(16*$i,"esp"),$tweak);
        &paddq ($tweak,$tweak); # &psllq($tweak,1);
        &pand ($twres,$twmask); # isolate carry and residue
        &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
        &pxor ($tweak,$twres);
    }
    # Tweak 4 goes to slot 4 ($i becomes 5 afterwards); tweak 5 is built in $inout5.
    &pshufd ($inout5,$twtmp,0x13);
    &movdqa (&QWP(16*$i++,"esp"),$tweak);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &$movekey ($rndkey0,&QWP(0,$key_));
    &pand ($inout5,$twmask); # isolate carry and residue
    &movups ($inout0,&QWP(0,$inp)); # load input
    &pxor ($inout5,$tweak);

    # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
    &mov ($rounds,$rounds_);
    &movdqu ($inout1,&QWP(16*1,$inp)); # overwrites $twmask (same register)
    &xorps ($inout0,$rndkey0); # input^=rndkey[0]
    &movdqu ($inout2,&QWP(16*2,$inp));
    &pxor ($inout1,$rndkey0);
    &movdqu ($inout3,&QWP(16*3,$inp));
    &pxor ($inout2,$rndkey0);
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout3,$rndkey0);
    &movdqu ($rndkey1,&QWP(16*5,$inp)); # sixth input block, parked in $rndkey1
    &pxor ($inout4,$rndkey0);
    &lea ($inp,&DWP(16*6,$inp));
    &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
    &pxor ($inout5,$rndkey1);

    # First AES decrypt round for all six blocks; the remaining rounds run
    # in _aesni_decrypt6_enter.
    &$movekey ($rndkey1,&QWP(16,$key_));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &aesdec ($inout0,$rndkey1);
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &aesdec ($inout1,$rndkey1);
    &pxor ($inout5,$rndkey0);
    &$movekey ($rndkey0,&QWP(32,$key_));
    &aesdec ($inout2,$rndkey1);
    &aesdec ($inout3,$rndkey1);
    &aesdec ($inout4,$rndkey1);
    &aesdec ($inout5,$rndkey1);
    &call (&label("_aesni_decrypt6_enter"));

    &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
    &pxor ($twtmp,$twtmp);
    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
    &xorps ($inout1,&QWP(16*1,"esp"));
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout2,&QWP(16*2,"esp"));
    &movups (&QWP(16*1,$out),$inout1);
    &xorps ($inout3,&QWP(16*3,"esp"));
    &movups (&QWP(16*2,$out),$inout2);
    &xorps ($inout4,&QWP(16*4,"esp"));
    &movups (&QWP(16*3,$out),$inout3);
    &xorps ($inout5,$tweak);
    &movups (&QWP(16*4,$out),$inout4);
    &pshufd ($twres,$twtmp,0x13);
    &movups (&QWP(16*5,$out),$inout5);
    &lea ($out,&DWP(16*6,$out));
    &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87

    # Advance $tweak once more for the next iteration / tail paths.
    &pxor ($twtmp,$twtmp);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);

    &sub ($len,16*6);
    &jnc (&label("xts_dec_loop6"));

    &mov ($rounds,&DWP(240,$key_)); # restore $rounds
    &mov ($key,$key_); # restore $key
    &mov ($rounds_,$rounds);

&set_label("xts_dec_short");
    # One to five whole blocks remain (zero -> xts_dec_done6x).
    &add ($len,16*6);
    &jz (&label("xts_dec_done6x"));

    &movdqa ($inout3,$tweak); # put aside previous tweak
    &cmp ($len,0x20);
    &jb (&label("xts_dec_one"));

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &je (&label("xts_dec_two")); # flags still from the cmp with 0x20

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &movdqa ($inout4,$tweak); # put aside previous tweak
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &cmp ($len,0x40);
    &jb (&label("xts_dec_three"));

    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &movdqa ($inout5,$tweak); # put aside previous tweak
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);
    &movdqa (&QWP(16*0,"esp"),$inout3);
    &movdqa (&QWP(16*1,"esp"),$inout4);
    &je (&label("xts_dec_four")); # flags still from the cmp with 0x40

    # Five blocks: tweaks 0..3 go to the stack, tweak 4 is built in $inout5.
    &movdqa (&QWP(16*2,"esp"),$inout5);
    &pshufd ($inout5,$twtmp,0x13);
    &movdqa (&QWP(16*3,"esp"),$tweak);
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($inout5,$twmask); # isolate carry and residue
    &pxor ($inout5,$tweak);

    &movdqu ($inout0,&QWP(16*0,$inp)); # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movdqu ($inout3,&QWP(16*3,$inp));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &lea ($inp,&DWP(16*5,$inp));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
    &pxor ($inout4,$inout5);

    &call ("_aesni_decrypt6");

    &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,&QWP(16*2,"esp"));
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout3,&QWP(16*3,"esp"));
    &movups (&QWP(16*1,$out),$inout1);
    &xorps ($inout4,$tweak);
    &movups (&QWP(16*2,$out),$inout2);
    &movups (&QWP(16*3,$out),$inout3);
    &movups (&QWP(16*4,$out),$inout4);
    &lea ($out,&DWP(16*5,$out));
    &jmp (&label("xts_dec_done"));

&set_label("xts_dec_one",16);
    # One block; its tweak is in $inout3.
    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &lea ($inp,&DWP(16*1,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    if ($inline)
    { &aesni_inline_generate1("dec"); }
    else
    { &call ("_aesni_decrypt1"); }
    &xorps ($inout0,$inout3); # output^=tweak
    &movups (&QWP(16*0,$out),$inout0); # write output
    &lea ($out,&DWP(16*1,$out));

    &movdqa ($tweak,$inout3); # last tweak
    &jmp (&label("xts_dec_done"));

&set_label("xts_dec_two",16);
    # Two blocks; tweaks are $inout3 and $tweak (copied to $inout4).
    &movaps ($inout4,$tweak); # put aside last tweak

    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &lea ($inp,&DWP(16*2,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    &xorps ($inout1,$inout4);

    &call ("_aesni_decrypt2");

    &xorps ($inout0,$inout3); # output^=tweak
    &xorps ($inout1,$inout4);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &movups (&QWP(16*1,$out),$inout1);
    &lea ($out,&DWP(16*2,$out));

    &movdqa ($tweak,$inout4); # last tweak
    &jmp (&label("xts_dec_done"));

&set_label("xts_dec_three",16);
    # Three blocks; tweaks are $inout3, $inout4 and $tweak (copied to $inout5).
    &movaps ($inout5,$tweak); # put aside last tweak
    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &movups ($inout2,&QWP(16*2,$inp));
    &lea ($inp,&DWP(16*3,$inp));
    &xorps ($inout0,$inout3); # input^=tweak
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);

    &call ("_aesni_decrypt3");

    &xorps ($inout0,$inout3); # output^=tweak
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &movups (&QWP(16*1,$out),$inout1);
    &movups (&QWP(16*2,$out),$inout2);
    &lea ($out,&DWP(16*3,$out));

    &movdqa ($tweak,$inout5); # last tweak
    &jmp (&label("xts_dec_done"));

&set_label("xts_dec_four",16);
    # Four blocks; tweaks 0 and 1 were stored at 16*0/16*1(%esp) before the
    # branch, tweak 2 is in $inout5 and tweak 3 is $tweak (copied to $inout4).
    &movaps ($inout4,$tweak); # put aside last tweak

    &movups ($inout0,&QWP(16*0,$inp)); # load input
    &movups ($inout1,&QWP(16*1,$inp));
    &movups ($inout2,&QWP(16*2,$inp));
    &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
    &movups ($inout3,&QWP(16*3,$inp));
    &lea ($inp,&DWP(16*4,$inp));
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout5);
    &xorps ($inout3,$inout4);

    &call ("_aesni_decrypt4");

    &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout5);
    &movups (&QWP(16*0,$out),$inout0); # write output
    &xorps ($inout3,$inout4);
    &movups (&QWP(16*1,$out),$inout1);
    &movups (&QWP(16*2,$out),$inout2);
    &movups (&QWP(16*3,$out),$inout3);
    &lea ($out,&DWP(16*4,$out));

    &movdqa ($tweak,$inout4); # last tweak
    &jmp (&label("xts_dec_done"));

&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
    # The 6x loop already advanced $tweak (and left $twtmp/$twmask ready),
    # so only one further tweak step is needed below.
    &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
    &and ($len,15);
    &jz (&label("xts_dec_ret"));
    &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
    &jmp (&label("xts_dec_only_one_more"));

&set_label("xts_dec_done",16);
    # $tweak is the tweak of the last block written; step it once here and
    # once more at xts_dec_only_one_more.
    &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
    &pxor ($twtmp,$twtmp);
    &and ($len,15);
    &jz (&label("xts_dec_ret"));

    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
    &pshufd ($twres,$twtmp,0x13);
    &pxor ($twtmp,$twtmp);
    &movdqa ($twmask,&QWP(16*6,"esp"));
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($twres,$twmask); # isolate carry and residue
    &pcmpgtd($twtmp,$tweak); # broadcast upper bits
    &pxor ($tweak,$twres);

&set_label("xts_dec_only_one_more");
    # $inout4 = current tweak, $inout3 = the tweak after it. The held-back
    # full block is decrypted with the *later* tweak ($inout3); the block
    # assembled by the steal loop is decrypted with the earlier one ($inout4).
    &pshufd ($inout3,$twtmp,0x13);
    &movdqa ($inout4,$tweak); # put aside previous tweak
    &paddq ($tweak,$tweak); # &psllq($tweak,1);
    &pand ($inout3,$twmask); # isolate carry and residue
    &pxor ($inout3,$tweak);

    &mov ($key,$key_); # restore $key
    &mov ($rounds,$rounds_); # restore $rounds

    &movups ($inout0,&QWP(0,$inp)); # load input
    &xorps ($inout0,$inout3); # input^=tweak
    if ($inline)
    { &aesni_inline_generate1("dec"); }
    else
    { &call ("_aesni_decrypt1"); }
    &xorps ($inout0,$inout3); # output^=tweak
    &movups (&QWP(0,$out),$inout0); # write output

&set_label("xts_dec_steal");
    # Byte loop, $len%16 iterations: swap each trailing input byte (at
    # 16($inp)) with the corresponding byte of the block just written
    # ($rounds and $key serve as byte temporaries).
    &movz ($rounds,&BP(16,$inp));
    &movz ($key,&BP(0,$out));
    &lea ($inp,&DWP(1,$inp));
    &mov (&BP(0,$out),&LB($rounds));
    &mov (&BP(16,$out),&LB($key));
    &lea ($out,&DWP(1,$out));
    &sub ($len,1);
    &jnz (&label("xts_dec_steal"));

    &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
    &mov ($key,$key_); # restore $key
    &mov ($rounds,$rounds_); # restore $rounds

    # Decrypt the patched block in place with the earlier tweak ($inout4).
    &movups ($inout0,&QWP(0,$out)); # load input
    &xorps ($inout0,$inout4); # input^=tweak
    if ($inline)
    { &aesni_inline_generate1("dec"); }
    else
    { &call ("_aesni_decrypt1"); }
    &xorps ($inout0,$inout4); # output^=tweak
    &movups (&QWP(0,$out),$inout0); # write output

&set_label("xts_dec_ret");
    # Wipe xmm registers and the tweak slots, then restore the caller's %esp.
    &pxor ("xmm0","xmm0"); # clear register bank
    &pxor ("xmm1","xmm1");
    &pxor ("xmm2","xmm2");
    &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
    &pxor ("xmm3","xmm3");
    &movdqa (&QWP(16*1,"esp"),"xmm0");
    &pxor ("xmm4","xmm4");
    &movdqa (&QWP(16*2,"esp"),"xmm0");
    &pxor ("xmm5","xmm5");
    &movdqa (&QWP(16*3,"esp"),"xmm0");
    &pxor ("xmm6","xmm6");
    &movdqa (&QWP(16*4,"esp"),"xmm0");
    &pxor ("xmm7","xmm7");
    &movdqa (&QWP(16*5,"esp"),"xmm0");
    &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
}

######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# Prologue: arguments are fetched with &wparam() in the order of the
# prototype above. A 16-byte-aligned frame is computed in $rounds_
# (%esp-24, rounded down to 16) and only swapped into %esp once $len is
# known to be non-zero. The flags set by the cmp of wparam(5) with 0 are
# consumed by a later &je to cbc_decrypt.
&function_begin("${PREFIX}_cbc_encrypt");
    &mov ($inp,&wparam(0));
    &mov ($rounds_,"esp");
    &mov ($out,&wparam(1));
    &sub ($rounds_,24);
    &mov ($len,&wparam(2));
    &and ($rounds_,-16);
    &mov ($key,&wparam(3));
    &mov ($key_,&wparam(4)); # $key_ temporarily holds ivp
    &test ($len,$len);
    &jz (&label("cbc_abort"));

    &cmp (&wparam(5),0);
    &xchg ($rounds_,"esp"); # alloca
    &movups ($ivec,&QWP(0,$key_));
# load IV (annotates the preceding &movups into $ivec)
#
# Body of ${PREFIX}_cbc_encrypt. On entry to this part:
#   $inp/$out/$len/$key hold the arguments, $ivec holds the IV,
#   $rounds_ holds the caller's %esp, %esp points at the aligned frame,
#   and the flags still reflect "cmp enc,0".
# Frame: 0(%esp) is a 16-byte scratch slot (saved IV / partial block),
# 16(%esp) holds the caller's %esp.
# Register aliases used below: $in0 is $inout4, $in1 is $inout3 and $ivec
# is $inout5 (same xmm registers).
    &mov ($rounds,&DWP(240,$key));
    &mov ($key_,$key); # backup $key
    &mov (&DWP(16,"esp"),$rounds_); # save original %esp
    &mov ($rounds_,$rounds); # backup $rounds
    &je (&label("cbc_decrypt")); # enc==0

    # ---- encrypt: one block at a time, $inout0 carries the chaining value
    &movaps ($inout0,$ivec);
    &cmp ($len,16);
    &jb (&label("cbc_enc_tail"));
    &sub ($len,16);
    &jmp (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movups ($ivec,&QWP(0,$inp)); # input actually
    &lea ($inp,&DWP(16,$inp));
    if ($inline)
    { &aesni_inline_generate1("enc",$inout0,$ivec); }
    else
    { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
    &mov ($rounds,$rounds_); # restore $rounds
    &mov ($key,$key_); # restore $key
    &movups (&QWP(0,$out),$inout0); # store output
    &lea ($out,&DWP(16,$out));
    &sub ($len,16);
    &jnc (&label("cbc_enc_loop"));
    &add ($len,16);
    &jnz (&label("cbc_enc_tail"));
    &movaps ($ivec,$inout0); # last ciphertext block becomes the output IV
    &pxor ($inout0,$inout0);
    &jmp (&label("cbc_ret"));

&set_label("cbc_enc_tail");
    # Partial final block: copy the $len remaining bytes to $out, zero-fill
    # up to 16 bytes, then run the loop once more in place over that block.
    &mov ("ecx",$len); # zaps $rounds
    &data_word(0xA4F3F689); # rep movsb
    &mov ("ecx",16); # zero tail
    &sub ("ecx",$len);
    &xor ("eax","eax"); # zaps $len
    &data_word(0xAAF3F689); # rep stosb
    &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
    &mov ($rounds,$rounds_); # restore $rounds
    &mov ($inp,$out); # $inp and $out are the same
    &mov ($key,$key_); # restore $key
    &jmp (&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
    &cmp ($len,0x50);
    &jbe (&label("cbc_dec_tail")); # five blocks or fewer
    &movaps (&QWP(0,"esp"),$ivec); # save IV
    &sub ($len,0x50);
    &jmp (&label("cbc_dec_loop6_enter"));

&set_label("cbc_dec_loop6",16);
    # The sixth output block of the previous iteration is written here,
    # one iteration late ($out was only advanced by 0x50 there).
    &movaps (&QWP(0,"esp"),$rndkey0); # save IV
    &movups (&QWP(0,$out),$inout5);
    &lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
    &movdqu ($inout0,&QWP(0,$inp));
    &movdqu ($inout1,&QWP(0x10,$inp));
    &movdqu ($inout2,&QWP(0x20,$inp));
    &movdqu ($inout3,&QWP(0x30,$inp));
    &movdqu ($inout4,&QWP(0x40,$inp));
    &movdqu ($inout5,&QWP(0x50,$inp));

    &call ("_aesni_decrypt6");

    # Xor each result with the preceding ciphertext block (re-read from $inp).
    &movups ($rndkey1,&QWP(0,$inp));
    &movups ($rndkey0,&QWP(0x10,$inp));
    &xorps ($inout0,&QWP(0,"esp")); # ^=IV
    &xorps ($inout1,$rndkey1);
    &movups ($rndkey1,&QWP(0x20,$inp));
    &xorps ($inout2,$rndkey0);
    &movups ($rndkey0,&QWP(0x30,$inp));
    &xorps ($inout3,$rndkey1);
    &movups ($rndkey1,&QWP(0x40,$inp));
    &xorps ($inout4,$rndkey0);
    &movups ($rndkey0,&QWP(0x50,$inp)); # IV
    &xorps ($inout5,$rndkey1);
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &lea ($inp,&DWP(0x60,$inp));
    &movups (&QWP(0x20,$out),$inout2);
    &mov ($rounds,$rounds_); # restore $rounds
    &movups (&QWP(0x30,$out),$inout3);
    &mov ($key,$key_); # restore $key
    &movups (&QWP(0x40,$out),$inout4);
    &lea ($out,&DWP(0x50,$out));
    &sub ($len,0x60);
    &ja (&label("cbc_dec_loop6"));

    &movaps ($inout0,$inout5); # pending sixth block
    &movaps ($ivec,$rndkey0);
    &add ($len,0x50);
    &jle (&label("cbc_dec_clear_tail_collected"));
    &movups (&QWP(0,$out),$inout0);
    &lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
    # One to five blocks remain; dispatch on $len. $in0/$in1 keep copies of
    # the first two ciphertext blocks for chaining.
    &movups ($inout0,&QWP(0,$inp));
    &movaps ($in0,$inout0);
    &cmp ($len,0x10);
    &jbe (&label("cbc_dec_one"));

    &movups ($inout1,&QWP(0x10,$inp));
    &movaps ($in1,$inout1);
    &cmp ($len,0x20);
    &jbe (&label("cbc_dec_two"));

    &movups ($inout2,&QWP(0x20,$inp));
    &cmp ($len,0x30);
    &jbe (&label("cbc_dec_three"));

    &movups ($inout3,&QWP(0x30,$inp));
    &cmp ($len,0x40);
    &jbe (&label("cbc_dec_four"));

    # Five blocks: $ivec shares a register with $inout5, so the IV is parked
    # on the stack before $inout5 is zeroed for the 6-way helper.
    &movups ($inout4,&QWP(0x40,$inp));
    &movaps (&QWP(0,"esp"),$ivec); # save IV
    &movups ($inout0,&QWP(0,$inp));
    &xorps ($inout5,$inout5);
    &call ("_aesni_decrypt6");
    &movups ($rndkey1,&QWP(0,$inp));
    &movups ($rndkey0,&QWP(0x10,$inp));
    &xorps ($inout0,&QWP(0,"esp")); # ^= IV
    &xorps ($inout1,$rndkey1);
    &movups ($rndkey1,&QWP(0x20,$inp));
    &xorps ($inout2,$rndkey0);
    &movups ($rndkey0,&QWP(0x30,$inp));
    &xorps ($inout3,$rndkey1);
    &movups ($ivec,&QWP(0x40,$inp)); # IV
    &xorps ($inout4,$rndkey0);
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &pxor ($inout1,$inout1);
    &movups (&QWP(0x20,$out),$inout2);
    &pxor ($inout2,$inout2);
    &movups (&QWP(0x30,$out),$inout3);
    &pxor ($inout3,$inout3);
    &lea ($out,&DWP(0x40,$out));
    &movaps ($inout0,$inout4); # last block is left pending in $inout0
    &pxor ($inout4,$inout4);
    &sub ($len,0x50);
    &jmp (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
    if ($inline)
    { &aesni_inline_generate1("dec"); }
    else
    { &call ("_aesni_decrypt1"); }
    &xorps ($inout0,$ivec);
    &movaps ($ivec,$in0); # ciphertext block becomes the next IV
    &sub ($len,0x10);
    &jmp (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
    &call ("_aesni_decrypt2");
    &xorps ($inout0,$ivec);
    &xorps ($inout1,$in0);
    &movups (&QWP(0,$out),$inout0);
    &movaps ($inout0,$inout1);
    &pxor ($inout1,$inout1);
    &lea ($out,&DWP(0x10,$out));
    &movaps ($ivec,$in1);
    &sub ($len,0x20);
    &jmp (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
    &call ("_aesni_decrypt3");
    &xorps ($inout0,$ivec);
    &xorps ($inout1,$in0);
    &xorps ($inout2,$in1);
    &movups (&QWP(0,$out),$inout0);
    &movaps ($inout0,$inout2);
    &pxor ($inout2,$inout2);
    &movups (&QWP(0x10,$out),$inout1);
    &pxor ($inout1,$inout1);
    &lea ($out,&DWP(0x20,$out));
    &movups ($ivec,&QWP(0x20,$inp));
    &sub ($len,0x30);
    &jmp (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
    &call ("_aesni_decrypt4");
    &movups ($rndkey1,&QWP(0x10,$inp));
    &movups ($rndkey0,&QWP(0x20,$inp));
    &xorps ($inout0,$ivec);
    &movups ($ivec,&QWP(0x30,$inp));
    &xorps ($inout1,$in0);
    &movups (&QWP(0,$out),$inout0);
    &xorps ($inout2,$rndkey1);
    &movups (&QWP(0x10,$out),$inout1);
    &pxor ($inout1,$inout1);
    &xorps ($inout3,$rndkey0);
    &movups (&QWP(0x20,$out),$inout2);
    &pxor ($inout2,$inout2);
    &lea ($out,&DWP(0x30,$out));
    &movaps ($inout0,$inout3);
    &pxor ($inout3,$inout3);
    &sub ($len,0x40);
    &jmp (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_clear_tail_collected",16);
    &pxor ($inout1,$inout1);
    &pxor ($inout2,$inout2);
    &pxor ($inout3,$inout3);
    &pxor ($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
    # $inout0 holds the final plaintext block, not yet written.
    &and ($len,15);
    &jnz (&label("cbc_dec_tail_partial"));
    &movups (&QWP(0,$out),$inout0);
    &pxor ($rndkey0,$rndkey0);
    &jmp (&label("cbc_ret"));

&set_label("cbc_dec_tail_partial",16);
    # Spill the final block to the scratch slot and byte-copy from there.
    # NOTE(review): the count is 16-$len with $len already masked to 0..15;
    # confirm that this is the intended number of bytes.
    &movaps (&QWP(0,"esp"),$inout0);
    &pxor ($rndkey0,$rndkey0);
    &mov ("ecx",16);
    &mov ($inp,"esp");
    &sub ("ecx",$len);
    &data_word(0xA4F3F689); # rep movsb
    # Scrub the scratch slot with the zeroed register. Storing $inout0 here
    # would just rewrite the plaintext block that is already in the slot.
    &movdqa (&QWP(0,"esp"),$rndkey0); # clear stack

&set_label("cbc_ret");
    &mov ("esp",&DWP(16,"esp")); # pull original %esp
    &mov ($key_,&wparam(4));
    &pxor ($inout0,$inout0);
    &pxor ($rndkey1,$rndkey1);
    &movups (&QWP(0,$key_),$ivec); # output IV
    &pxor ($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code
#	$rounds	rounds
#
# Return code is 0 on success, -1 if either pointer is zero and -2 if
# bits is not 128, 192 or 256.  On success $rounds is left holding 9,
# 11 or 13 (i.e. rounds-1) and the same value is stored in the key
# structure; every path below places that store 240 bytes past the
# start of the schedule.  "ebp" and "ebx" are saved and restored, and
# xmm0-xmm5 are zeroed on the success path so that no key material is
# left behind in registers.
#
# Each key size has two code paths.  The default one is built around
# aeskeygenassist; the "_alt" one uses pshufb+aesenclast with constants
# from the key_const table and is taken when, of the two capability
# bits masked out of OPENSSL_ia32cap_P[1], only 1<<28 is set.

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# call/pop yields the address of "pic"; rebase it so that "ebx"
	# points at key_const for the rest of the function
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));	# second dword of OPENSSL_ia32cap_P
	&lea	($key,&DWP(16,$key));	# round 0 goes to -16($key)
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&cmp	("ebp",1<<28);
	&je	(&label("10rounds_alt"));

	&mov	($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call	(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call	(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

# key_128 stores the round key held in xmm0 and advances $key, then
# falls through to key_128_cold, which folds xmm0 and the
# aeskeygenassist result in xmm1 into the next round key (left in xmm0).
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret	();

&set_label("10rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&mov	($rounds,8);			# loop counter
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# 1,1,1,1; doubled per iteration
	&movdqa	("xmm2","xmm0");
	&movdqu	(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb	("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld	("xmm4",1);
	&lea	($key,&DWP(16,$key));

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(-16,$key),"xmm0");
	&movdqa	("xmm2","xmm0");

	&dec	($rounds);
	&jnz	(&label("loop_key128"));

	# last two round keys use the 0x1b constant row (shifted once
	# in between) instead of the doubled 1,1,1,1 row
	&movdqa	("xmm4",&QWP(0x30,"ebx"));

	&pshufb	("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld	("xmm4",1);

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(0,$key),"xmm0");

	&movdqa	("xmm2","xmm0");
	&pshufb	("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(16,$key),"xmm0");

	&mov	($rounds,9);
	&mov	(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("12rounds",16);
	&movq	("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp	("ebp",1<<28);
	&je	(&label("12rounds_alt"));

	&mov	($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call	(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call	(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(48,$key),$rounds);

	&jmp	(&label("good_key"));

# key_192a stores one 16-byte block (xmm0) and advances $key by 16;
# key_192b stores two 16-byte blocks assembled from xmm5/xmm0/xmm2 and
# advances $key by 32.  Both then run the shared key_192b_warm tail,
# which updates xmm0 and xmm2 from the aeskeygenassist result in xmm1.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps	("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps	("xmm4","xmm0",0b00010000);
	&movdqa	("xmm3","xmm2");
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&pslldq	("xmm3",4);
	&xorps	("xmm0","xmm4");
	&pshufd	("xmm1","xmm1",0b01010101);	# critical path
	&pxor	("xmm2","xmm3");
	&pxor	("xmm0","xmm1");
	&pshufd	("xmm3","xmm0",0b11111111);
	&pxor	("xmm2","xmm3");
	&ret	();

&set_label("key_192b",16);
	&movaps	("xmm3","xmm0");
	&shufps	("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps	("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea	($key,&DWP(32,$key));
	&jmp	(&label("key_192b_warm"));

&set_label("12rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x10,"ebx"));	# pshufb mask
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# 1,1,1,1; doubled per iteration
	&mov	($rounds,8);			# loop counter
	&movdqu	(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
	&movq	(&QWP(0,$key),"xmm2");
	&movdqa	("xmm1","xmm2");
	&pshufb	("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld	("xmm4",1);
	&lea	($key,&DWP(24,$key));	# 24 bytes of schedule per iteration

	&movdqa	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm0","xmm3");

	&pshufd	("xmm3","xmm0",0xff);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");

	&pxor	("xmm0","xmm2");
	&pxor	("xmm2","xmm3");
	&movdqu	(&QWP(-16,$key),"xmm0");

	&dec	($rounds);
	&jnz	(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("14rounds",16);
	&movups	("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea	($key,&DWP(16,$key));
	&cmp	("ebp",1<<28);
	&je	(&label("14rounds_alt"));

	&mov	($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call	(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call	(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(16,$key),$rounds);
	&xor	("eax","eax");

	&jmp	(&label("good_key"));

# key_256a stores xmm2 and derives the next xmm0; key_256b stores xmm0
# and derives the next xmm2.  The two halves of the 256-bit state are
# thus updated alternately.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea	($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret	();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));

	&shufps	("xmm4","xmm2",0b00010000);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm4","xmm2",0b10001100);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm1","xmm1",0b10101010);	# critical path
	&xorps	("xmm2","xmm1");
	&ret	();

&set_label("14rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# 1,1,1,1; doubled per iteration
	&mov	($rounds,7);			# loop counter
	&movdqu	(&QWP(-32,$key),"xmm0");
	&movdqa	("xmm1","xmm2");
	&movdqu	(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	&pshufb	("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm0","xmm3");
	&pslld	("xmm4",1);

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(0,$key),"xmm0");

	# the final iteration produces only the first of the two
	# 16-byte blocks, so leave before computing the second one
	&dec	($rounds);
	&jz	(&label("done_key256"));

	&pshufd	("xmm2","xmm0",0xff);
	&pxor	("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	# Each instruction is its own statement.  Without the ';' Perl
	# would parse the next "&pslldq (...)" as the right-hand operand
	# of a bitwise '&' applied to this call's return value.
	&movdqa	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm1","xmm3");

	&pxor	("xmm2","xmm1");
	&movdqu	(&QWP(16,$key),"xmm2");
	&lea	($key,&DWP(32,$key));
	&movdqa	("xmm1","xmm2");
	&jmp	(&label("loop_key256"));

&set_label("done_key256");
	&mov	($rounds,13);
	&mov	(&DWP(16,$key),$rounds);

&set_label("good_key");
	# scrub every xmm register that held key material
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");		# return success
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");	# first 128 bits of *userKey were loaded
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point: loads the three stack arguments into the registers
# _aesni_set_encrypt_key expects and returns its return code in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and, if that
# succeeded, converts it in place: the order of the round keys is
# reversed and every round key except the first and the last is passed
# through aesimc.  A non-zero return code from _aesni_set_encrypt_key is
# returned unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# $key was advanced by the callee
	&shl	($rounds,4);		# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));

	# $key walks up from the front, "eax" walks down from the back,
	# until the two pointers meet on the middle round key
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc	("xmm0","xmm0");
	&aesimc	("xmm1","xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp	("eax",$key);
	&ja	(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc	("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&xor	("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");

# Constants for the "_alt" key-schedule paths, addressed relative to
# "ebx" in _aesni_set_encrypt_key:
#	0x00	pshufb mask used by the 128- and 256-bit paths
#	0x10	pshufb mask used by the 192-bit path
#	0x20	1,1,1,1 - starting value that is shifted left each round
#	0x30	0x1b x4 - loaded for the last two 128-bit rounds
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors only surface at close; fail loudly rather than
# leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";