#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
#       53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt.
# Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB
# Westmere      3.77/1.37       1.37    1.52    1.27
# * Bridge      5.07/0.98       0.99    1.09    0.91
# Haswell       4.44/0.80       0.97    1.03    0.72
# Silvermont    5.77/3.56       3.67    4.03    3.46
# Bulldozer     5.80/0.98       1.05    1.24    0.93

$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

# Derive the directory this script lives in from $0, then make both that
# directory and its ../../perlasm sibling searchable for x86asm.pl.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# NOTE(review): asm_init is provided by x86asm.pl; presumably $ARGV[0]
# selects the assembler flavour -- confirm against x86asm.pl.
&asm_init($ARGV[0],$0);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys. Both branches currently select
# movups.
if ($PREFIX eq "aesni") { $movekey=\&movups; }
else                    { $movekey=\&movups; }

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp";    # backup copy for $key

# XMM register assignment. $in1, $in0 and $ivec are aliases that share
# xmm5, xmm6 and xmm7 with $inout3, $inout4 and $inout5 respectively.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";

# AESNI extension
#
# The helpers below emit the raw opcode bytes for the AES-NI instructions
# through data_byte(). Only the register-to-register form with xmm0..xmm7
# operands is encodable here: the ModRM byte is 0xc0|(dst<<3)|src.
#
# Any other operand used to make these helpers emit nothing at all, which
# would silently drop an instruction from the generated code. They now die
# with a diagnostic instead. NB: callers that invoke them through string
# eval do not inspect $@, so such call sites still will not abort.

# aeskeygenassist(dst,src,imm): emits 66 0F 3A DF /r ib.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }
    else
    {   die "aeskeygenassist: unsupported operands '$dst','$src'\n";   }
}

# aescommon(opcodelet,dst,src): emits 66 0F 38 <opcodelet> /r. Shared
# encoder behind aesimc/aesenc/aesenclast/aesdec/aesdeclast.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);  }
    else
    {   die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'\n",
                    $opcodelet,$dst,$src);
    }
}
sub aesimc      { aescommon(0xdb,@_); }
sub aesenc      { aescommon(0xdc,@_); }
sub aesenclast  { aescommon(0xdd,@_); }
sub aesdec      { aescommon(0xde,@_); }
sub aesdeclast  { aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1(p [,inout [,ivec]])
#   p      "enc" or "dec"; selects aes${p} / aes${p}last.
#   inout  register holding the block; defaults to $inout0.
#   ivec   optional register. When given, it is first xored with round
#          key 0 and the result is xored into inout; otherwise inout is
#          xored with round key 0 directly.
# The emitted loop decrements $rounds down to zero and advances $key, so
# both registers are clobbered. $sn is bumped on every expansion so that
# each inlined copy gets a unique loop label.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey           ($rndkey0,&QWP(0,$key));
    &$movekey           ($rndkey1,&QWP(16,$key));
    &xorps              ($ivec,$rndkey0)        if (defined($ivec));
    &lea                ($key,&DWP(32,$key));
    &xorps              ($inout,$ivec)          if (defined($ivec));
    &xorps              ($inout,$rndkey0)       if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
        eval"&aes${p} ($inout,$rndkey1)";
        &dec            ($rounds);
        &$movekey       ($rndkey1,&QWP(0,$key));
        &lea            ($key,&DWP(16,$key));   # lea leaves flags from dec intact
    &jnz                (&label("${p}1_loop_$sn"));
    eval"&aes${p}last ($inout,$rndkey1)";
}}

# aesni_generate1(p [,inout]) emits the out-of-line, fully unrolled
# single-block routine _aesni_${p}rypt1. It is only generated when
# $inline is false. $rounds is compared with 11 once: below 11 jumps to
# label ${p}128, exactly 11 jumps to ${p}192, anything larger falls
# through and executes four extra rounds before reaching ${p}192.
sub aesni_generate1     # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
        &movups         ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &xorps          ($inout,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &cmp            ($rounds,11);
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key)); # flags from cmp survive lea
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(-0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(-0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(0x10,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(0x30,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(0x50,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0x60,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(0x70,$key));
        eval"&aes${p} ($inout,$rndkey1)";
    eval"&aes${p}last ($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption: loads 16 bytes from inp, reads the round count
# from offset 240 of key, encrypts, stores 16 bytes to out, and zeroes
# the three XMM registers it used before returning.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
        &mov    ("eax",&wparam(0));
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &pxor   ($rndkey0,$rndkey0);            # clear register bank
        &pxor   ($rndkey1,$rndkey1);
        &movups (&QWP(0,"eax"),$inout0);
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption; mirrors $PREFIX_encrypt with "dec" rounds.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
        &mov    ("eax",&wparam(0));
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }
        &pxor   ($rndkey0,$rndkey0);            # clear register bank
        &pxor   ($rndkey1,$rndkey1);
        &movups (&QWP(0,"eax"),$inout0);
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...

# Loop-counter convention shared by all _aesni_${p}ryptN generators
# below: $rounds is multiplied by 16, $key is advanced to
# 32+$key+$rounds, and $rounds is then negated and biased by 16. Each
# loop iteration adds 32 to $rounds and loads the next two round keys
# relative to $key; the loop ends when $rounds reaches zero. Both $key
# and $rounds are clobbered, which is why callers keep backups in
# $key_ / $rounds_.

# aesni_generate2(p): emits _aesni_${p}rypt2, processing $inout0 and
# $inout1 in parallel ("enc" or "dec" selected by p).
sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}2_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}2_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

# aesni_generate3(p): emits _aesni_${p}rypt3, processing $inout0..$inout2
# in parallel.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}3_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4(p): emits _aesni_${p}rypt4, processing $inout0..$inout3
# in parallel.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shl            ($rounds,4);
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        # four raw bytes emitted verbatim
        # NOTE(review): 0f 1f 40 00 looks like a 4-byte NOP used as
        # padding ahead of the loop label -- confirm.
        &data_byte      (0x0f,0x1f,0x40,0x00);
        &add            ($rounds,16);

    &set_label("${p}4_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
        eval"&aes${p} ($inout3,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        eval"&aes${p} ($inout3,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}4_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# aesni_generate6(p): emits _aesni_${p}rypt6, processing $inout0..$inout5
# in parallel. The prologue already issues the first round for
# $inout0..$inout2 and then jumps into the middle of the loop body
# (label _aesni_${p}rypt6_inner) to finish that round for the remaining
# three blocks. A second label, _aesni_${p}rypt6_enter, is declared
# static so code outside this routine can call in after performing the
# whole first round itself.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);     # pxor does better here
        &pxor           ($inout2,$rndkey0);
        eval"&aes${p} ($inout0,$rndkey1)";
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);
        eval"&aes${p} ($inout1,$rndkey1)";
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        eval"&aes${p} ($inout2,$rndkey1)";
        &pxor           ($inout5,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0,$key,$rounds));
        &add            ($rounds,16);
        &jmp            (&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
    &set_label("_aesni_${p}rypt6_inner");
        eval"&aes${p} ($inout3,$rndkey1)";
        eval"&aes${p} ($inout4,$rndkey1)";
        eval"&aes${p} ($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        eval"&aes${p} ($inout3,$rndkey0)";
        eval"&aes${p} ($inout4,$rndkey0)";
        eval"&aes${p} ($inout5,$rndkey0)";
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}6_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p} ($inout4,$rndkey1)";
    eval"&aes${p} ($inout5,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    eval"&aes${p}last ($inout4,$rndkey0)";
    eval"&aes${p}last ($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The interleaved encrypt routines are only emitted for the "aesni"
# prefix; the decrypt ones are emitted unconditionally.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is processed when
# the result is zero. A zero enc argument selects decryption. Input is
# consumed six blocks (0x60 bytes) at a time through _aesni_*crypt6;
# the remaining one to five blocks go through the 1x/2x/3x/4x routines,
# or through the 6x routine with a zeroed sixth block when five are
# left. All eight XMM registers are cleared before returning.
&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));
        &and    ($len,-16);
        &jz     (&label("ecb_ret"));
        &mov    ($rounds,&DWP(240,$key));
        &test   ($rounds_,$rounds_);
        &jz     (&label("ecb_decrypt"));

        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &cmp    ($len,0x60);
        &jb     (&label("ecb_enc_tail"));

        &movdqu ($inout0,&QWP(0,$inp));
        &movdqu ($inout1,&QWP(0x10,$inp));
        &movdqu ($inout2,&QWP(0x20,$inp));
        &movdqu ($inout3,&QWP(0x30,$inp));
        &movdqu ($inout4,&QWP(0x40,$inp));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
        &sub    ($len,0x60);
        &jmp    (&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
        # store the previous six results while loading the next six blocks
        &movups (&QWP(0,$out),$inout0);
        &movdqu ($inout0,&QWP(0,$inp));
        &movups (&QWP(0x10,$out),$inout1);
        &movdqu ($inout1,&QWP(0x10,$inp));
        &movups (&QWP(0x20,$out),$inout2);
        &movdqu ($inout2,&QWP(0x20,$inp));
        &movups (&QWP(0x30,$out),$inout3);
        &movdqu ($inout3,&QWP(0x30,$inp));
        &movups (&QWP(0x40,$out),$inout4);
        &movdqu ($inout4,&QWP(0x40,$inp));
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

        &call   ("_aesni_encrypt6");

        &mov    ($key,$key_);           # restore $key
        &mov    ($rounds,$rounds_);     # restore $rounds
        &sub    ($len,0x60);
        &jnc    (&label("ecb_enc_loop6"));

        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &movups (&QWP(0x40,$out),$inout4);
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &add    ($len,0x60);            # undo the last, borrowing sub
        &jz     (&label("ecb_ret"));

&set_label("ecb_enc_tail");
        # movups does not alter flags, so each cmp feeds two branches
        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x20);
        &jb     (&label("ecb_enc_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_enc_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x40);
        &jb     (&label("ecb_enc_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &je     (&label("ecb_enc_four"));
        &movups ($inout4,&QWP(0x40,$inp));
        &xorps  ($inout5,$inout5);      # five blocks: sixth lane is a dummy
        &call   ("_aesni_encrypt6");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &movups (&QWP(0x40,$out),$inout4);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
        &call   ("_aesni_encrypt2");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
        &call   ("_aesni_encrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_four",16);
        &call   ("_aesni_encrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &jmp    (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &cmp    ($len,0x60);
        &jb     (&label("ecb_dec_tail"));

        &movdqu ($inout0,&QWP(0,$inp));
        &movdqu ($inout1,&QWP(0x10,$inp));
        &movdqu ($inout2,&QWP(0x20,$inp));
        &movdqu ($inout3,&QWP(0x30,$inp));
        &movdqu ($inout4,&QWP(0x40,$inp));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
        &sub    ($len,0x60);
        &jmp    (&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
        &movups (&QWP(0,$out),$inout0);
        &movdqu ($inout0,&QWP(0,$inp));
        &movups (&QWP(0x10,$out),$inout1);
        &movdqu ($inout1,&QWP(0x10,$inp));
        &movups (&QWP(0x20,$out),$inout2);
        &movdqu ($inout2,&QWP(0x20,$inp));
        &movups (&QWP(0x30,$out),$inout3);
        &movdqu ($inout3,&QWP(0x30,$inp));
        &movups (&QWP(0x40,$out),$inout4);
        &movdqu ($inout4,&QWP(0x40,$inp));
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

        &call   ("_aesni_decrypt6");

        &mov    ($key,$key_);           # restore $key
        &mov    ($rounds,$rounds_);     # restore $rounds
        &sub    ($len,0x60);
        &jnc    (&label("ecb_dec_loop6"));

        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &movups (&QWP(0x40,$out),$inout4);
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &add    ($len,0x60);
        &jz     (&label("ecb_ret"));

&set_label("ecb_dec_tail");
        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x20);
        &jb     (&label("ecb_dec_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_dec_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x40);
        &jb     (&label("ecb_dec_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &je     (&label("ecb_dec_four"));
        &movups ($inout4,&QWP(0x40,$inp));
        &xorps  ($inout5,$inout5);
        &call   ("_aesni_decrypt6");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &movups (&QWP(0x40,$out),$inout4);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
        &call   ("_aesni_decrypt2");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
        &call   ("_aesni_decrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_four",16);
        &call   ("_aesni_decrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        # falls through to ecb_ret

&set_label("ecb_ret");
        &pxor   ("xmm0","xmm0");                # clear register bank
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &pxor   ("xmm5","xmm5");
        &pxor   ("xmm6","xmm6");
        &pxor   ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const
#                         AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both routines build two 16-byte constants on an aligned stack frame:
# a pshufb byte-reversal mask at 0(%esp) and the increment vector
# {1,0,0,0} at 16(%esp); the caller's %esp is saved at 48(%esp). The
# counter block is kept byte-reversed in $ivec so paddq can bump it, and
# is reversed back into $inout0 before each encryption. The counter
# block and the running CMAC ($cmac, an alias of $inout1) are pushed
# through the AES rounds side by side.
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));          # ivec pointer, for now
        &mov    ($rounds,&wparam(5));           # cmac pointer, for now
        &mov    ($key_,"esp");
        &sub    ("esp",60);
        &and    ("esp",-16);                    # align stack
        &mov    (&DWP(48,"esp"),$key_);

        &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
        &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
        &mov    ($rounds,&DWP(240,$key));

        # compose byte-swap control mask for pshufb on stack
        &mov    (&DWP(0,"esp"),0x0c0d0e0f);
        &mov    (&DWP(4,"esp"),0x08090a0b);
        &mov    (&DWP(8,"esp"),0x04050607);
        &mov    (&DWP(12,"esp"),0x00010203);

        # compose counter increment vector on stack
        &mov    ($rounds_,1);
        &xor    ($key_,$key_);
        &mov    (&DWP(16,"esp"),$rounds_);
        &mov    (&DWP(20,"esp"),$key_);
        &mov    (&DWP(24,"esp"),$key_);
        &mov    (&DWP(28,"esp"),$key_);

        # $key_ keeps the key schedule base, $rounds_ = 16-16*rounds is
        # the loop counter reloaded into $rounds on every outer iteration
        &shl    ($rounds,4);
        &mov    ($rounds_,16);
        &lea    ($key_,&DWP(0,$key));
        &movdqa ($inout3,&QWP(0,"esp"));
        &movdqa ($inout0,$ivec);
        &lea    ($key,&DWP(32,$key,$rounds));
        &sub    ($rounds_,$rounds);
        &pshufb ($ivec,$inout3);

&set_label("ccm64_enc_outer");
        &$movekey       ($rndkey0,&QWP(0,$key_));
        &mov            ($rounds,$rounds_);
        &movups         ($in0,&QWP(0,$inp));

        &xorps          ($inout0,$rndkey0);
        &$movekey       ($rndkey1,&QWP(16,$key_));
        &xorps          ($rndkey0,$in0);
        &xorps          ($cmac,$rndkey0);               # cmac^=inp
        &$movekey       ($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
        &aesenc         ($inout0,$rndkey1);
        &aesenc         ($cmac,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &aesenc         ($inout0,$rndkey0);
        &aesenc         ($cmac,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("ccm64_enc2_loop"));
        &aesenc         ($inout0,$rndkey1);
        &aesenc         ($cmac,$rndkey1);
        &paddq          ($ivec,&QWP(16,"esp"));
        &dec            ($len);                 # flags consumed by jnz below
        &aesenclast     ($inout0,$rndkey0);
        &aesenclast     ($cmac,$rndkey0);

        &lea    ($inp,&DWP(16,$inp));
        &xorps  ($in0,$inout0);                 # inp^=E(ivec)
        &movdqa ($inout0,$ivec);
        &movups (&QWP(0,$out),$in0);            # save output
        &pshufb ($inout0,$inout3);
        &lea    ($out,&DWP(16,$out));
        &jnz    (&label("ccm64_enc_outer"));

        &mov    ("esp",&DWP(48,"esp"));         # restore caller's stack pointer
        &mov    ($out,&wparam(5));
        &movups (&QWP(0,$out),$cmac);           # write back cmac

        &pxor   ("xmm0","xmm0");                # clear register bank
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &pxor   ("xmm5","xmm5");
        &pxor   ("xmm6","xmm6");
        &pxor   ("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");

# Decrypt counterpart. The first counter block is encrypted on its own
# before the loop; after that each iteration xors the previous E(ivec)
# into the input, folds the resulting output into cmac, and encrypts the
# next counter block and cmac together. The last cmac update is done
# after the loop (ccm64_dec_break).
&function_begin("aesni_ccm64_decrypt_blocks");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));          # ivec pointer, for now
        &mov    ($rounds,&wparam(5));           # cmac pointer, for now
        &mov    ($key_,"esp");
        &sub    ("esp",60);
        &and    ("esp",-16);                    # align stack
        &mov    (&DWP(48,"esp"),$key_);

        &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
        &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
        &mov    ($rounds,&DWP(240,$key));

        # compose byte-swap control mask for pshufb on stack
        &mov    (&DWP(0,"esp"),0x0c0d0e0f);
        &mov    (&DWP(4,"esp"),0x08090a0b);
        &mov    (&DWP(8,"esp"),0x04050607);
        &mov    (&DWP(12,"esp"),0x00010203);

        # compose counter increment vector on stack
        &mov    ($rounds_,1);
        &xor    ($key_,$key_);
        &mov    (&DWP(16,"esp"),$rounds_);
        &mov    (&DWP(20,"esp"),$key_);
        &mov    (&DWP(24,"esp"),$key_);
        &mov    (&DWP(28,"esp"),$key_);

        &movdqa ($inout3,&QWP(0,"esp"));        # bswap mask
        &movdqa ($inout0,$ivec);

        &mov    ($key_,$key);                   # backup $key
        &mov    ($rounds_,$rounds);             # backup $rounds

        &pshufb ($ivec,$inout3);
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &shl    ($rounds_,4);
        &mov    ($rounds,16);
        &movups ($in0,&QWP(0,$inp));            # load inp
        &paddq  ($ivec,&QWP(16,"esp"));
        &lea    ($inp,&QWP(16,$inp));
        &sub    ($rounds,$rounds_);
        &lea    ($key,&DWP(32,$key_,$rounds_));
        &mov    ($rounds_,$rounds);
        &jmp    (&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
        &xorps  ($in0,$inout0);                 # inp ^= E(ivec)
        &movdqa ($inout0,$ivec);
        &movups (&QWP(0,$out),$in0);            # save output
        &lea    ($out,&DWP(16,$out));
        &pshufb ($inout0,$inout3);

        &sub    ($len,1);
        &jz     (&label("ccm64_dec_break"));

        &$movekey       ($rndkey0,&QWP(0,$key_));
        &mov            ($rounds,$rounds_);
        &$movekey       ($rndkey1,&QWP(16,$key_));
        &xorps          ($in0,$rndkey0);
        &xorps          ($inout0,$rndkey0);
        &xorps          ($cmac,$in0);           # cmac^=out
        &$movekey       ($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
        &aesenc         ($inout0,$rndkey1);
        &aesenc         ($cmac,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &aesenc         ($inout0,$rndkey0);
        &aesenc         ($cmac,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("ccm64_dec2_loop"));
        &movups         ($in0,&QWP(0,$inp));    # load inp
        &paddq          ($ivec,&QWP(16,"esp"));
        &aesenc         ($inout0,$rndkey1);
        &aesenc         ($cmac,$rndkey1);
        &aesenclast     ($inout0,$rndkey0);
        &aesenclast     ($cmac,$rndkey0);
        &lea            ($inp,&QWP(16,$inp));
        &jmp            (&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
        &mov    ($rounds,&DWP(240,$key_));
        &mov    ($key,$key_);
        # final cmac update: with the third argument the inlined code
        # xors round key 0 into $in0 (the last output) and then $in0
        # into $cmac before running the rounds on $cmac
        if ($inline)
        {   &aesni_inline_generate1("enc",$cmac,$in0);  }
        else
        {   &call       ("_aesni_encrypt1",$cmac);      }

        &mov    ("esp",&DWP(48,"esp"));         # restore caller's stack pointer
        &mov    ($out,&wparam(5));
        &movups (&QWP(0,$out),$cmac);           # write back cmac

        &pxor   ("xmm0","xmm0");                # clear register bank
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &pxor   ("xmm5","xmm5");
        &pxor   ("xmm6","xmm6");
        &pxor   ("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
#       0       pshufb mask
#       16      vector addend: 0,6,6,6
#       32      counter-less ivec
#       48      1st triplet of counter vector
#       64      2nd triplet of counter vector
#       80      saved %esp
#
# A single-block request takes a shortcut that encrypts *ivec as is.
# Otherwise six counter values are kept as two vectors of three 32-bit
# integers, advanced by 6 per iteration with paddd and byte-swapped with
# pshufb before being merged with the counter-less ivec. Six or more
# blocks go through the 6x loop; one to five leftover blocks are handled
# by the tail code. Stack slots 32..79 and all XMM registers are wiped
# on exit.

&function_begin("aesni_ctr32_encrypt_blocks");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));          # ivec pointer, for now
        &mov    ($key_,"esp");
        &sub    ("esp",88);
        &and    ("esp",-16);                    # align stack
        &mov    (&DWP(80,"esp"),$key_);

        &cmp    ($len,1);
        &je     (&label("ctr32_one_shortcut"));

        &movdqu ($inout5,&QWP(0,$rounds_));     # load ivec

        # compose byte-swap control mask for pshufb on stack
        &mov    (&DWP(0,"esp"),0x0c0d0e0f);
        &mov    (&DWP(4,"esp"),0x08090a0b);
        &mov    (&DWP(8,"esp"),0x04050607);
        &mov    (&DWP(12,"esp"),0x00010203);

        # compose counter increment vector on stack
        &mov    ($rounds,6);
        &xor    ($key_,$key_);
        &mov    (&DWP(16,"esp"),$rounds);
        &mov    (&DWP(20,"esp"),$rounds);
        &mov    (&DWP(24,"esp"),$rounds);
        &mov    (&DWP(28,"esp"),$key_);

        &pextrd ($rounds_,$inout5,3);           # pull 32-bit counter
        &pinsrd ($inout5,$key_,3);              # wipe 32-bit counter

        &mov    ($rounds,&DWP(240,$key));       # key->rounds

        # compose 2 vectors of 3x32-bit counters
        &bswap  ($rounds_);
        &pxor   ($rndkey0,$rndkey0);
        &pxor   ($rndkey1,$rndkey1);
        &movdqa ($inout0,&QWP(0,"esp"));        # load byte-swap mask
        &pinsrd ($rndkey0,$rounds_,0);
        &lea    ($key_,&DWP(3,$rounds_));
        &pinsrd ($rndkey1,$key_,0);
        &inc    ($rounds_);
        &pinsrd ($rndkey0,$rounds_,1);
        &inc    ($key_);
        &pinsrd ($rndkey1,$key_,1);
        &inc    ($rounds_);
        &pinsrd ($rndkey0,$rounds_,2);
        &inc    ($key_);
        &pinsrd ($rndkey1,$key_,2);
        &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
        &pshufb ($rndkey0,$inout0);             # byte swap
        &movdqu ($inout4,&QWP(0,$key));         # key[0]
        &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
        &pshufb ($rndkey1,$inout0);             # byte swap

        &pshufd ($inout0,$rndkey0,3<<6);        # place counter to upper dword
        &pshufd ($inout1,$rndkey0,2<<6);
        &cmp    ($len,6);
        &jb     (&label("ctr32_tail"));
        &pxor   ($inout5,$inout4);              # counter-less ivec^key[0]
        &shl    ($rounds,4);
        &mov    ($rounds_,16);
        &movdqa (&QWP(32,"esp"),$inout5);       # save counter-less ivec^key[0]
        &mov    ($key_,$key);                   # backup $key
        &sub    ($rounds_,$rounds);             # backup twisted $rounds
        &lea    ($key,&DWP(32,$key,$rounds));
        &sub    ($len,6);
        &jmp    (&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
        # inlining _aesni_encrypt6's prologue gives ~6% improvement...
        &pshufd ($inout2,$rndkey0,1<<6);
        &movdqa ($rndkey0,&QWP(32,"esp"));      # pull counter-less ivec
        &pshufd ($inout3,$rndkey1,3<<6);
        &pxor           ($inout0,$rndkey0);     # merge counter-less ivec
        &pshufd ($inout4,$rndkey1,2<<6);
        &pxor           ($inout1,$rndkey0);
        &pshufd ($inout5,$rndkey1,1<<6);
        &$movekey       ($rndkey1,&QWP(16,$key_));
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &aesenc         ($inout0,$rndkey1);
        &pxor           ($inout4,$rndkey0);
        &pxor           ($inout5,$rndkey0);
        &aesenc         ($inout1,$rndkey1);
        &$movekey       ($rndkey0,&QWP(32,$key_));
        &mov            ($rounds,$rounds_);
        &aesenc         ($inout2,$rndkey1);
        &aesenc         ($inout3,$rndkey1);
        &aesenc         ($inout4,$rndkey1);
        &aesenc         ($inout5,$rndkey1);

        # first round was done above for all six blocks, so join the
        # 6x routine at its secondary entry point
        &call           (&label("_aesni_encrypt6_enter"));

        &movups ($rndkey1,&QWP(0,$inp));
        &movups ($rndkey0,&QWP(0x10,$inp));
        &xorps  ($inout0,$rndkey1);
        &movups ($rndkey1,&QWP(0x20,$inp));
        &xorps  ($inout1,$rndkey0);
        &movups (&QWP(0,$out),$inout0);
        &movdqa ($rndkey0,&QWP(16,"esp"));      # load increment
        &xorps  ($inout2,$rndkey1);
        &movdqa ($rndkey1,&QWP(64,"esp"));      # load 2nd triplet
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);

        &paddd  ($rndkey1,$rndkey0);            # 2nd triplet increment
        &paddd  ($rndkey0,&QWP(48,"esp"));      # 1st triplet increment
        &movdqa ($inout0,&QWP(0,"esp"));        # load byte swap mask

        &movups ($inout1,&QWP(0x30,$inp));
        &movups ($inout2,&QWP(0x40,$inp));
        &xorps  ($inout3,$inout1);
        &movups ($inout1,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
        &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
        &pshufb ($rndkey0,$inout0);             # byte swap
        &xorps  ($inout4,$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &xorps  ($inout5,$inout1);
        &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
        &pshufb ($rndkey1,$inout0);             # byte swap
        &movups (&QWP(0x40,$out),$inout4);
        &pshufd ($inout0,$rndkey0,3<<6);
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));

        &pshufd ($inout1,$rndkey0,2<<6);
        &sub    ($len,6);
        &jnc    (&label("ctr32_loop6"));

        &add    ($len,6);
        &jz     (&label("ctr32_ret"));
        &movdqu ($inout5,&QWP(0,$key_));
        &mov    ($key,$key_);
        &pxor   ($inout5,&QWP(32,"esp"));       # restore count-less ivec
        &mov    ($rounds,&DWP(240,$key_));      # restore $rounds

&set_label("ctr32_tail");
        # one to five blocks left; por/pshufd leave the flags of each cmp
        # intact, so every cmp feeds both a jb and a je
        &por    ($inout0,$inout5);
        &cmp    ($len,2);
        &jb     (&label("ctr32_one"));

        &pshufd ($inout2,$rndkey0,1<<6);
        &por    ($inout1,$inout5);
        &je     (&label("ctr32_two"));

        &pshufd ($inout3,$rndkey1,3<<6);
        &por    ($inout2,$inout5);
        &cmp    ($len,4);
        &jb     (&label("ctr32_three"));

        &pshufd ($inout4,$rndkey1,2<<6);
        &por    ($inout3,$inout5);
        &je     (&label("ctr32_four"));

        &por    ($inout4,$inout5);
        &call   ("_aesni_encrypt6");
        &movups ($rndkey1,&QWP(0,$inp));
        &movups ($rndkey0,&QWP(0x10,$inp));
        &xorps  ($inout0,$rndkey1);
        &movups ($rndkey1,&QWP(0x20,$inp));
        &xorps  ($inout1,$rndkey0);
        &movups ($rndkey0,&QWP(0x30,$inp));
        &xorps  ($inout2,$rndkey1);
        &movups ($rndkey1,&QWP(0x40,$inp));
        &xorps  ($inout3,$rndkey0);
        &movups (&QWP(0,$out),$inout0);
        &xorps  ($inout4,$rndkey1);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &movups (&QWP(0x40,$out),$inout4);
        &jmp    (&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
        &movups ($inout0,&QWP(0,$rounds_));     # load ivec
        &mov    ($rounds,&DWP(240,$key));

&set_label("ctr32_one");
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &movups ($in0,&QWP(0,$inp));
        &xorps  ($in0,$inout0);
        &movups (&QWP(0,$out),$in0);
        &jmp    (&label("ctr32_ret"));

&set_label("ctr32_two",16);
        &call   ("_aesni_encrypt2");
        &movups ($inout3,&QWP(0,$inp));
        &movups ($inout4,&QWP(0x10,$inp));
        &xorps  ($inout0,$inout3);
        &xorps  ($inout1,$inout4);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ctr32_ret"));

&set_label("ctr32_three",16);
        &call   ("_aesni_encrypt3");
        &movups ($inout3,&QWP(0,$inp));
        &movups ($inout4,&QWP(0x10,$inp));
        &xorps  ($inout0,$inout3);
        &movups ($inout5,&QWP(0x20,$inp));
        &xorps  ($inout1,$inout4);
        &movups (&QWP(0,$out),$inout0);
        &xorps  ($inout2,$inout5);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &jmp    (&label("ctr32_ret"));

&set_label("ctr32_four",16);
        &call   ("_aesni_encrypt4");
        &movups ($inout4,&QWP(0,$inp));
        &movups ($inout5,&QWP(0x10,$inp));
        &movups ($rndkey1,&QWP(0x20,$inp));
        &xorps  ($inout0,$inout4);
        &movups ($rndkey0,&QWP(0x30,$inp));
        &xorps  ($inout1,$inout5);
        &movups (&QWP(0,$out),$inout0);
        &xorps  ($inout2,$rndkey1);
        &movups (&QWP(0x10,$out),$inout1);
        &xorps  ($inout3,$rndkey0);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        # falls through to ctr32_ret

&set_label("ctr32_ret");
        &pxor   ("xmm0","xmm0");                # clear register bank
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &movdqa (&QWP(32,"esp"),"xmm0");        # clear stack
        &pxor   ("xmm5","xmm5");
        &movdqa (&QWP(48,"esp"),"xmm0");
        &pxor   ("xmm6","xmm6");
        &movdqa (&QWP(64,"esp"),"xmm0");
        &pxor   ("xmm7","xmm7");
        &mov    ("esp",&DWP(80,"esp"));         # restore caller's stack pointer
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#       const AES_KEY *key1, const AES_KEY *key2
#       const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

# aesni_xts_encrypt: emits the XTS encryption routine.
#
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=tweak block.
#
# Stack frame used below, relative to the 16-byte aligned %esp:
#   16*0 .. 16*5	per-block tweak values
#   16*6		dwords 0x87,0,1,0 (loaded into $twmask)
#   16*7+0		saved $len, later overwritten with $len%16
#   16*7+4		caller's %esp
&function_begin("aesni_xts_encrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	# Run one block through the cipher with key2: $inout0 holds the
	# tweak block on entry and the initial tweak value afterwards.
	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&and	("esp",-16);			# align stack

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
	&mov	(&DWP(16*6+12,"esp"),0);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp

	&movdqa	($tweak,$inout0);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits

	&and	($len,-16);			# whole blocks only
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&sub	($len,16*6);
	&jc	(&label("xts_enc_short"));	# fewer than six blocks

	# Six-block loop setup: $key is advanced by 32+16*rounds and
	# $rounds_ becomes 16-16*rounds; $key_ keeps the schedule start.
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_enc_loop6"));

# Main loop: 16*6 bytes (six blocks) per iteration.
&set_label("xts_enc_loop6",16);
	# Store tweaks 0..3 in stack slots 0..3, doubling $tweak each time.
	for ($i=0;$i<4;$i++) {
	    &pshufd	($twres,$twtmp,0x13);
	    &pxor	($twtmp,$twtmp);
	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
	    &pand	($twres,$twmask);	# isolate carry and residue
	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	    &pxor	($tweak,$twres);
	}
	# $i is 4 here: tweak 4 goes to slot 4, then $i becomes 5.
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&$movekey	($rndkey0,&QWP(0,$key_));
	&pand	($inout5,$twmask);		# isolate carry and residue
	&movups	($inout0,&QWP(0,$inp));		# load input
	&pxor	($inout5,$tweak);		# $inout5 = tweak 5

	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);		# restore $rounds
	&movdqu	($inout1,&QWP(16*1,$inp));
	&xorps	($inout0,$rndkey0);		# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout1,$rndkey0);
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout2,$rndkey0);
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout3,$rndkey0);
	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block, parked in $rndkey1
	&pxor	($inout4,$rndkey0);
	&lea	($inp,&DWP(16*6,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
	&pxor	($inout5,$rndkey1);		# tweak 5 ^ sixth input block

	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&aesenc	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&aesenc	($inout1,$rndkey1);
	&pxor	($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);
	&call	(&label("_aesni_encrypt6_enter"));

	# Post-whitening and stores, interleaved with the start of the
	# next tweak computation.
	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
	&pxor	($twtmp,$twtmp);
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	&xorps	($inout1,&QWP(16*1,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*2,$out),$inout2);
	&xorps	($inout4,&QWP(16*4,"esp"));
	&movups	(&QWP(16*3,$out),$inout3);
	&xorps	($inout5,$tweak);
	&movups	(&QWP(16*4,$out),$inout4);
	&pshufd	($twres,$twtmp,0x13);
	&movups	(&QWP(16*5,$out),$inout5);
	&lea	($out,&DWP(16*6,$out));
	&movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87

	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

	&sub	($len,16*6);
	&jnc	(&label("xts_enc_loop6"));

	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);

# Tail: zero to five whole blocks remain.  Dispatch on the byte count:
# 0 -> done6x, <0x20 -> one, ==0x20 -> two, <0x40 -> three,
# ==0x40 -> four, otherwise fall through to the five-block path.
&set_label("xts_enc_short");
	&add	($len,16*6);
	&jz	(&label("xts_enc_done6x"));

	&movdqa	($inout3,$tweak);		# put aside previous tweak
	&cmp	($len,0x20);
	&jb	(&label("xts_enc_one"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&je	(&label("xts_enc_two"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&cmp	($len,0x40);
	&jb	(&label("xts_enc_three"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout5,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&movdqa	(&QWP(16*0,"esp"),$inout3);
	&movdqa	(&QWP(16*1,"esp"),$inout4);
	&je	(&label("xts_enc_four"));

	# Five blocks: tweaks 0..4 live in stack slots 0..4.
	&movdqa	(&QWP(16*2,"esp"),$inout5);
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*3,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout5,$twmask);		# isolate carry and residue
	&pxor	($inout5,$tweak);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&lea	($inp,&DWP(16*5,$inp));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
	&pxor	($inout4,$inout5);

	# Six-block helper is used; only $inout0..$inout4 are stored below.
	&call	("_aesni_encrypt6");

	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout4,$tweak);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&movups	(&QWP(16*4,$out),$inout4);
	&lea	($out,&DWP(16*5,$out));
	&jmp	(&label("xts_enc_done"));

# One block: tweak is in $inout3.
&set_label("xts_enc_one",16);
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16*1,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&lea	($out,&DWP(16*1,$out));

	&movdqa	($tweak,$inout3);		# last tweak
	&jmp	(&label("xts_enc_done"));

# Two blocks: tweaks in $inout3 and $inout4.
&set_label("xts_enc_two",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_encrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&lea	($out,&DWP(16*2,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_enc_done"));

# Three blocks: tweaks in $inout3, $inout4 and $inout5.
&set_label("xts_enc_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_encrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_enc_done"));

# Four blocks: tweaks 0 and 1 are in stack slots 0 and 1, tweak 2 in
# $inout5 and tweak 3 in $inout4.
&set_label("xts_enc_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_encrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_enc_done"));

# No whole blocks were left after the six-block loop.  If $len%16 is
# non-zero, go straight to the stealing loop with $tweak as is.
&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));
	&movdqa	($inout3,$tweak);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_enc_steal"));

# A tail path ran and $tweak holds the last tweak used.  If $len%16 is
# non-zero, compute one more tweak into $inout3 for the final block.
&set_label("xts_enc_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));

	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($inout3,$twtmp,0x13);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
	&pxor	($inout3,$tweak);

# Byte loop, $len%16 iterations: each input tail byte is written into
# the previous output block at -16($out), and the byte it displaces is
# written to 0($out).  $rounds and $key serve as byte scratch here.
&set_label("xts_enc_steal");
	&movz	($rounds,&BP(0,$inp));
	&movz	($key,&BP(-16,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(-16,$out),&LB($rounds));
	&mov	(&BP(0,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_enc_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	# Encrypt the merged block at -16($out) in place with $inout3.
	&movups	($inout0,&QWP(-16,$out));	# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(-16,$out),$inout0);	# write output

# Common exit: zero xmm0-xmm7 and stack slots 0..5, then restore %esp.
&set_label("xts_enc_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
	&pxor	("xmm3","xmm3");
	&movdqa	(&QWP(16*1,"esp"),"xmm0");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(16*2,"esp"),"xmm0");
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(16*3,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(16*4,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&movdqa	(&QWP(16*5,"esp"),"xmm0");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_encrypt");

# aesni_xts_decrypt: emits the XTS decryption routine.
#
# Same argument order and stack frame as aesni_xts_encrypt.  Differences
# visible below: the initial tweak block is still processed with the
# "enc" primitive, data blocks use the "dec" primitives, and when
# len%16 is non-zero one whole block is held back (len-=16) so that it
# can be handled together with the partial tail.
&function_begin("aesni_xts_decrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&and	("esp",-16);			# align stack

	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
	&test	($len,15);
	&setnz	(&LB($rounds_));
	&shl	($rounds_,4);
	&sub	($len,$rounds_);

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
	&mov	(&DWP(16*6+12,"esp"),0);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp

	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&movdqa	($tweak,$inout0);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits

	&and	($len,-16);			# whole blocks only
	&sub	($len,16*6);
	&jc	(&label("xts_dec_short"));	# fewer than six blocks

	# Six-block loop setup, same as in aesni_xts_encrypt.
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_dec_loop6"));

# Main loop: 16*6 bytes (six blocks) per iteration.
&set_label("xts_dec_loop6",16);
	# Store tweaks 0..3 in stack slots 0..3, doubling $tweak each time.
	for ($i=0;$i<4;$i++) {
	    &pshufd	($twres,$twtmp,0x13);
	    &pxor	($twtmp,$twtmp);
	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
	    &pand	($twres,$twmask);	# isolate carry and residue
	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	    &pxor	($tweak,$twres);
	}
	# $i is 4 here: tweak 4 goes to slot 4, then $i becomes 5.
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&$movekey	($rndkey0,&QWP(0,$key_));
	&pand	($inout5,$twmask);		# isolate carry and residue
	&movups	($inout0,&QWP(0,$inp));		# load input
	&pxor	($inout5,$tweak);		# $inout5 = tweak 5

	# inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);
	&movdqu	($inout1,&QWP(16*1,$inp));
	&xorps	($inout0,$rndkey0);		# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout1,$rndkey0);
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout2,$rndkey0);
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout3,$rndkey0);
	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block, parked in $rndkey1
	&pxor	($inout4,$rndkey0);
	&lea	($inp,&DWP(16*6,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
	&pxor	($inout5,$rndkey1);		# tweak 5 ^ sixth input block

	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&aesdec	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&aesdec	($inout1,$rndkey1);
	&pxor	($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&aesdec	($inout2,$rndkey1);
	&aesdec	($inout3,$rndkey1);
	&aesdec	($inout4,$rndkey1);
	&aesdec	($inout5,$rndkey1);
	&call	(&label("_aesni_decrypt6_enter"));

	# Post-whitening and stores, interleaved with the start of the
	# next tweak computation.
	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
	&pxor	($twtmp,$twtmp);
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	&xorps	($inout1,&QWP(16*1,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*2,$out),$inout2);
	&xorps	($inout4,&QWP(16*4,"esp"));
	&movups	(&QWP(16*3,$out),$inout3);
	&xorps	($inout5,$tweak);
	&movups	(&QWP(16*4,$out),$inout4);
	&pshufd	($twres,$twtmp,0x13);
	&movups	(&QWP(16*5,$out),$inout5);
	&lea	($out,&DWP(16*6,$out));
	&movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87

	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

	&sub	($len,16*6);
	&jnc	(&label("xts_dec_loop6"));

	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);

# Tail: zero to five whole blocks remain.  Dispatch on the byte count:
# 0 -> done6x, <0x20 -> one, ==0x20 -> two, <0x40 -> three,
# ==0x40 -> four, otherwise fall through to the five-block path.
&set_label("xts_dec_short");
	&add	($len,16*6);
	&jz	(&label("xts_dec_done6x"));

	&movdqa	($inout3,$tweak);		# put aside previous tweak
	&cmp	($len,0x20);
	&jb	(&label("xts_dec_one"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&je	(&label("xts_dec_two"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&cmp	($len,0x40);
	&jb	(&label("xts_dec_three"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout5,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&movdqa	(&QWP(16*0,"esp"),$inout3);
	&movdqa	(&QWP(16*1,"esp"),$inout4);
	&je	(&label("xts_dec_four"));

	# Five blocks: tweaks 0..4 live in stack slots 0..4.
	&movdqa	(&QWP(16*2,"esp"),$inout5);
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*3,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout5,$twmask);		# isolate carry and residue
	&pxor	($inout5,$tweak);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&lea	($inp,&DWP(16*5,$inp));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
	&pxor	($inout4,$inout5);

	# Six-block helper is used; only $inout0..$inout4 are stored below.
	&call	("_aesni_decrypt6");

	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout4,$tweak);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&movups	(&QWP(16*4,$out),$inout4);
	&lea	($out,&DWP(16*5,$out));
	&jmp	(&label("xts_dec_done"));

# One block: tweak is in $inout3.
&set_label("xts_dec_one",16);
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16*1,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&lea	($out,&DWP(16*1,$out));

	&movdqa	($tweak,$inout3);		# last tweak
	&jmp	(&label("xts_dec_done"));

# Two blocks: tweaks in $inout3 and $inout4.
&set_label("xts_dec_two",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_decrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&lea	($out,&DWP(16*2,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

# Three blocks: tweaks in $inout3, $inout4 and $inout5.
&set_label("xts_dec_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_decrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_dec_done"));

# Four blocks: tweaks 0 and 1 are in stack slots 0 and 1, tweak 2 in
# $inout5 and tweak 3 in $inout4.
&set_label("xts_dec_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_decrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

# No whole blocks were left after the six-block loop.  If $len%16 is
# non-zero, $tweak/$twtmp are already set up for xts_dec_only_one_more.
&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_dec_only_one_more"));

# A tail path ran and $tweak holds the last tweak used.  If $len%16 is
# non-zero, advance $tweak once before xts_dec_only_one_more.
&set_label("xts_dec_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));

	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(16*6,"esp"));
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

# The held-back whole block is decrypted with the *next* tweak
# ($inout3); the current tweak is kept in $inout4 for the merged block
# produced by the stealing loop.
&set_label("xts_dec_only_one_more");
	&pshufd	($inout3,$twtmp,0x13);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,$twmask);		# isolate carry and residue
	&pxor	($inout3,$tweak);

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	&movups	($inout0,&QWP(0,$inp));		# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

# Byte loop, $len%16 iterations: each input tail byte at 16($inp) is
# written into the block at 0($out), and the byte it displaces is
# written to 16($out).  $rounds and $key serve as byte scratch here.
&set_label("xts_dec_steal");
	&movz	($rounds,&BP(16,$inp));
	&movz	($key,&BP(0,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(0,$out),&LB($rounds));
	&mov	(&BP(16,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_dec_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	# Decrypt the merged block at 0($out) in place with $inout4.
	&movups	($inout0,&QWP(0,$out));		# load input
	&xorps	($inout0,$inout4);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout4);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

# Common exit: zero xmm0-xmm7 and stack slots 0..5, then restore %esp.
&set_label("xts_dec_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
	&pxor	("xmm3","xmm3");
	&movdqa	(&QWP(16*1,"esp"),"xmm0");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(16*2,"esp"),"xmm0");
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(16*3,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(16*4,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&movdqa	(&QWP(16*5,"esp"),"xmm0");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_decrypt");
}
}


######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# Emits the CBC routine.  wparam(5) selects the direction: zero jumps
# to cbc_decrypt, anything else takes the encrypt path.  A zero length
# returns immediately through cbc_abort without touching the IV.
#
# Stack frame after the "alloca" below (16-byte aligned):
#   0(%esp)	16-byte scratch (saved IV / partial output block)
#   16(%esp)	caller's %esp
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($rounds_,"esp");
	&mov	($out,&wparam(1));
	&sub	($rounds_,24);
	&mov	($len,&wparam(2));
	&and	($rounds_,-16);
	&mov	($key,&wparam(3));
	&mov	($key_,&wparam(4));
	&test	($len,$len);
	&jz	(&label("cbc_abort"));

	&cmp	(&wparam(5),0);			# flags are consumed by &je below
	&xchg	($rounds_,"esp");		# alloca
	&movups	($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	&je	(&label("cbc_decrypt"));

	# Encrypt path: $inout0 carries the chaining value.
	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

# One block per iteration.
&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0,$out),$inout0);		# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);		# last output block becomes the IV
	&pxor	($inout0,$inout0);
	&jmp	(&label("cbc_ret"));

# Partial final block: copy the $len remaining input bytes to $out,
# zero-fill up to 16 bytes, then run the loop once more on that block
# in place.
&set_label("cbc_enc_tail");
	&mov	("ecx",$len);			# zaps $rounds
	&data_word(0xA4F3F689);			# rep movsb
	&mov	("ecx",16);			# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");			# zaps $len
	&data_word(0xAAF3F689);			# rep stosb
	&lea	($out,&DWP(-16,$out));		# rewind $out by 1 block
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($inp,$out);			# $inp and $out are the same
	&mov	($key,$key_);			# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
# Decrypt path.  Inputs of at most 0x50 bytes go straight to the tail
# dispatcher; longer ones run the six-block loop first.
&set_label("cbc_decrypt",16);
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

# Six blocks per iteration.  The sixth output block of an iteration is
# stored at the top of the next one.
&set_label("cbc_dec_loop6",16);
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	# Each result is xored with the preceding input block, reloaded
	# from $inp; the first one uses the saved IV.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	&movaps	($inout0,$inout5);		# pending sixth output block
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_clear_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
# Tail dispatch on remaining bytes: <=0x10 one, <=0x20 two, <=0x30
# three, <=0x40 four, otherwise five blocks.  $in0/$in1 keep copies of
# the first two input blocks for chaining.
&set_label("cbc_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	# Five blocks via the six-block helper, with $inout5 zeroed.
	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&pxor	($inout3,$inout3);
	&lea	($out,&DWP(0x40,$out));
	&movaps	($inout0,$inout4);		# last block is left in $inout0
	&pxor	($inout4,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

# In each tail case the final output block is left in $inout0 for
# cbc_dec_tail_collected, and $ivec is set to the last input block.
&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&call	("_aesni_decrypt2");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&pxor	($inout3,$inout3);
	&sub	($len,0x40);
	&jmp	(&label("cbc_dec_tail_collected"));

# Entered from the six-block loop: wipe $inout1..$inout4 first.
&set_label("cbc_dec_clear_tail_collected",16);
	&pxor	($inout1,$inout1);
	&pxor	($inout2,$inout2);
	&pxor	($inout3,$inout3);
	&pxor	($inout4,$inout4);
# $inout0 holds the last output block.  If $len&15 is zero it is
# stored whole; otherwise it goes through the stack scratch slot.
&set_label("cbc_dec_tail_collected");
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&jmp	(&label("cbc_ret"));

# Spill the block to 0(%esp) and copy 16-$len bytes from there to $out
# with rep movsb ($len is already masked to 0..15 here).
&set_label("cbc_dec_tail_partial",16);
	&movaps	(&QWP(0,"esp"),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);			# rep movsb
	&movdqa	(&QWP(0,"esp"),$inout0);

# Common exit: restore %esp, write $ivec back through wparam(4) and
# clear $inout0, $rndkey1 and $ivec.
&set_label("cbc_ret");
	&mov	("esp",&DWP(16,"esp"));		# pull original %esp
	&mov	($key_,&wparam(4));
	&pxor	($inout0,$inout0);
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,$key_),$ivec);		# output IV
	&pxor	($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");


######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code (0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not 128, 192 or 256)
#	$rounds	rounds-1 on success (9, 11 or 13); the same value is also
#		stored 240 bytes past the start of the key schedule
#
# Two expansion strategies are generated for every key size.  The
# default one is built around aeskeygenassist.  The "_alt" one, built
# around pshufb+aesenclast and the masks/round constants in key_const,
# is taken when the masked capability word equals exactly 1<<28, i.e.
# the AVX bit is set and the XOP bit is clear.
#
# xmm0-xmm5 are wiped before a successful return so that no key
# material is left behind in registers.

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");			# userKey == NULL?
	&jz	(&label("bad_pointer"));
	&test	($key,$key);			# key == NULL?
	&jz	(&label("bad_pointer"));

	# Position-independent address of key_const: the call/blindpop
	# pair is expected to yield the address of "pic", which the lea
	# then rebases by (key_const - pic).
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	# picmeup (x86asm.pl helper) is expected to leave the address of
	# OPENSSL_ia32cap_P in ebp; its second dword is fetched below.
	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));	# $key now points at round 1 slot
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

	# ---- 128-bit key -------------------------------------------------
&set_label("10rounds",16);
	&cmp	("ebp",1<<28);
	&je	(&label("10rounds_alt"));

	&mov	($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call	(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call	(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov	(&DWP(80,$key),$rounds);		# $key is 160 bytes in

	&jmp	(&label("good_key"));

	# key_128 stores the key computed by the previous step and advances
	# $key; key_128_cold is the entry used for the very first step,
	# when there is nothing to store yet.
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x00,"ebx"));	# pshufb mask 0x0c0f0e0d
	&mov	($rounds,8);
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&movdqa	("xmm2","xmm0");
	&movdqu	(&QWP(-16,$key),"xmm0");	# round 0

&set_label("loop_key128");			# rounds 1..8
	&pshufb	("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld	("xmm4",1);			# next round constant
	&lea	($key,&DWP(16,$key));

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(-16,$key),"xmm0");
	&movdqa	("xmm2","xmm0");

	&dec	($rounds);
	&jnz	(&label("loop_key128"));

	&movdqa	("xmm4",&QWP(0x30,"ebx"));	# round constant 0x1b

	&pshufb	("xmm0","xmm5");		# round 9
	&aesenclast	("xmm0","xmm4");
	&pslld	("xmm4",1);			# 0x1b -> 0x36

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(0,$key),"xmm0");

	&movdqa	("xmm2","xmm0");		# round 10
	&pshufb	("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm3","xmm2");
	&pslldq	("xmm2",4);
	&pxor	("xmm2","xmm3");

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(16,$key),"xmm0");

	&mov	($rounds,9);
	&mov	(&DWP(96,$key),$rounds);	# $key is 144 bytes in

	&jmp	(&label("good_key"));

	# ---- 192-bit key -------------------------------------------------
&set_label("12rounds",16);
	&movq	("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp	("ebp",1<<28);
	&je	(&label("12rounds_alt"));

	&mov	($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call	(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call	(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov	(&DWP(48,$key),$rounds);		# $key is 192 bytes in

	&jmp	(&label("good_key"));

	# key_192a stores one 16-byte slot, key_192b stores two; both fall
	# into the shared expansion step at key_192b_warm.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps	("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps	("xmm4","xmm0",0b00010000);
	&movdqa	("xmm3","xmm2");
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&pslldq	("xmm3",4);
	&xorps	("xmm0","xmm4");
	&pshufd	("xmm1","xmm1",0b01010101);	# critical path
	&pxor	("xmm2","xmm3");
	&pxor	("xmm0","xmm1");
	&pshufd	("xmm3","xmm0",0b11111111);
	&pxor	("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps	("xmm3","xmm0");
	&shufps	("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps	("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea	($key,&DWP(32,$key));
	&jmp	(&label("key_192b_warm"));

&set_label("12rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x10,"ebx"));	# pshufb mask 0x04070605
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&mov	($rounds,8);
	&movdqu	(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");			# 24 bytes of schedule per pass
	&movq	(&QWP(0,$key),"xmm2");
	&movdqa	("xmm1","xmm2");
	&pshufb	("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld	("xmm4",1);			# next round constant
	&lea	($key,&DWP(24,$key));

	&movdqa	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm0","xmm3");

	&pshufd	("xmm3","xmm0",0xff);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");

	&pxor	("xmm0","xmm2");
	&pxor	("xmm2","xmm3");
	&movdqu	(&QWP(-16,$key),"xmm0");

	&dec	($rounds);
	&jnz	(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);	# $key is 208 bytes in

	&jmp	(&label("good_key"));

	# ---- 256-bit key -------------------------------------------------
&set_label("14rounds",16);
	&movups	("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea	($key,&DWP(16,$key));		# $key now points at round 2 slot
	&cmp	("ebp",1<<28);
	&je	(&label("14rounds_alt"));

	&mov	($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call	(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call	(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov	(&DWP(16,$key),$rounds);		# $key is 224 bytes in
	&xor	("eax","eax");			# good_key clears eax as well

	&jmp	(&label("good_key"));

	# key_256a stores xmm2 and then updates xmm0; key_256b stores xmm0
	# and then updates xmm2.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea	($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));

	&shufps	("xmm4","xmm2",0b00010000);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm4","xmm2",0b10001100);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm1","xmm1",0b10101010);	# critical path
	&xorps	("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa	("xmm5",&QWP(0x00,"ebx"));	# pshufb mask 0x0c0f0e0d
	&movdqa	("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&mov	($rounds,7);
	&movdqu	(&QWP(-32,$key),"xmm0");	# round 0
	&movdqa	("xmm1","xmm2");
	&movdqu	(&QWP(-16,$key),"xmm2");	# round 1

&set_label("loop_key256");			# two round keys per full pass
	&pshufb	("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm3","xmm0");
	&pslldq	("xmm0",4);
	&pxor	("xmm0","xmm3");
	&pslld	("xmm4",1);			# next round constant

	&pxor	("xmm0","xmm2");
	&movdqu	(&QWP(0,$key),"xmm0");

	&dec	($rounds);
	&jz	(&label("done_key256"));	# last pass emits one key only

	&pshufd	("xmm2","xmm0",0xff);
	&pxor	("xmm3","xmm3");		# all-zero round key
	&aesenclast	("xmm2","xmm3");

	&movdqa	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm3","xmm1");
	&pslldq	("xmm1",4);
	&pxor	("xmm1","xmm3");

	&pxor	("xmm2","xmm1");
	&movdqu	(&QWP(16,$key),"xmm2");
	&lea	($key,&DWP(32,$key));
	&movdqa	("xmm1","xmm2");
	&jmp	(&label("loop_key256"));

&set_label("done_key256");
	&mov	($rounds,13);
	&mov	(&DWP(16,$key),$rounds);	# $key is 224 bytes in

	# ---- common exits ------------------------------------------------
&set_label("good_key");
	&pxor	("xmm0","xmm0");		# scrub key material
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");			# return 0
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");		# first 128 key bits were loaded
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: moves the three stack arguments into the registers
# expected by _aesni_set_encrypt_key and returns its result in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule first and, if that succeeded, converts
# it in place: the order of the round keys is reversed and aesimc is
# applied to every round key except the first and the last one.  A
# non-zero code from _aesni_set_encrypt_key is returned unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));		# callee advanced $key, reload it
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc	("xmm0","xmm0");
	&aesimc	("xmm1","xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp	("eax",$key);
	&ja	(&label("dec_key_inverse"));	# until the pointers meet

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc	("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor	("xmm0","xmm0");		# scrub key material
	&pxor	("xmm1","xmm1");
	&xor	("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");

# Constants used by the "_alt" key expansion paths, addressed relative
# to "ebx" in _aesni_set_encrypt_key:
#	0x00	pshufb mask for the 128- and 256-bit paths
#	0x10	pshufb mask for the 192-bit path
#	0x20	initial round constant (1), doubled with pslld each round
#	0x30	round constant 0x1b, used by the last two 128-bit rounds
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();