#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt.
# Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91
# Haswell	4.44/0.80	0.97	1.03	0.72
# Atom		5.77/3.56	3.67	4.03	3.46
# Bulldozer	5.80/0.98	1.05	1.24	0.93

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Locate the directory this script lives in so that x86asm.pl can be
# found either next to it or under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $movekey is the instruction used to load round keys. Both branches
# currently select movups; the branch is kept so the two $PREFIX
# flavours can diverge without touching the call sites.
if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment. Note that $in1/$in0/$ivec alias
# $inout3/$inout4/$inout5 respectively.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
#
# The AES-NI instructions are emitted as raw opcode bytes through
# data_byte(). Only the register-to-register form with xmm0-xmm7
# operands is encoded; for any other operand combination the regex
# does not match and nothing is emitted.

# aeskeygenassist(dst,src,imm): 66 0F 3A DF /r ib
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
}
# aescommon(opcodelet,dst,src): 66 0F 38 <opcodelet> /r, shared by the
# wrappers below, which differ only in the opcode byte they pass.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1($p[,$inout[,$ivec]]) emits a single-block
# round loop in place.
#   $p     - "enc" or "dec"; selects aes${p}/aes${p}last.
#   $inout - register holding the block, defaults to $inout0.
#   $ivec  - optional; when given, $ivec is xored with round key 0 and
#            the result is xored into $inout, instead of xoring $inout
#            with round key 0 directly.
# The emitted code advances $key and counts $rounds down to zero.
# $sn is bumped on every call so that each expansion gets its own
# loop label.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    eval"&aes${p}last	($inout,$rndkey1)";
}}

# aesni_generate1($p[,$inout]) emits the out-of-line subroutine
# _aesni_${p}rypt1. $rounds is compared against 11: below jumps to the
# "${p}128" entry, equal to the "${p}192" entry, above falls through
# the extra leading rounds first.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x10,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x30,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x50,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x60,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x70,$key));
	eval"&aes${p}	($inout,$rndkey1)";
    eval"&aes${p}last	($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Loads one 16-byte block from inp, reads the round count from offset
# 240 of key, processes the block and stores it to out.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Mirror image of $PREFIX_encrypt using the "dec" round instructions.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account.
# For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...

# The aesni_generateN($p) subs below emit _aesni_${p}ryptN, which runs
# N blocks ($inout0..$inout<N-1>) through the cipher in parallel.
# On entry $key points at the key schedule and $rounds holds the round
# count. $rounds is scaled by 16, $key is moved to
# 32+$key+16*$rounds, and $rounds is then negated and biased by 16 so
# that it serves as a negative byte offset from $key; the loop adds 32
# per iteration (two rounds) and ends when the offset reaches zero.

sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}2_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}3_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# NOTE(review): 0f 1f 40 00 looks like a 4-byte NOP used as
	# padding ahead of the loop - confirm against the opcode map.
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# 6x variant. Besides the normal entry it exposes the static label
# _aesni_${p}rypt6_enter in the middle of the loop, so that a caller
# which has already emitted the prologue and first round itself can
# call straight into the loop body.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	&add		($rounds,16);
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}	($inout4,$rndkey1)";
    eval"&aes${p}	($inout5,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    eval"&aes${p}last	($inout4,$rndkey0)";
    eval"&aes${p}last	($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The decrypt flavours are always generated; the encrypt flavours only
# when building under the "aesni" prefix.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

# NOTE(review): the block opened here is not closed within this
# section; its closing brace is expected further down the file.
if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; a zero result returns
# immediately. A zero enc selects the decrypt path. Input is consumed
# 6 blocks (0x60 bytes) at a time while possible; the remaining 1-5
# blocks are dispatched to the 1x/2x/3x/4x/6x helpers (5 blocks use
# the 6x helper with a zeroed sixth register).
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	# store the previous 6 results while loading the next 6 inputs
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec!
# Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both routines keep a 16-byte-aligned scratch area on the stack:
#	0	pshufb byte-swap mask
#	16	counter increment vector (1,0,0,0 as dwords)
#	48	saved %esp
# The counter block and the running CMAC are pushed through the
# aesenc rounds side by side ($inout0 and $cmac), sharing each loaded
# round key.
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	# $key_ keeps the start of the key schedule, $key is moved past
	# it and $rounds_ holds the negative offset (16-16*rounds) that
	# is reloaded into $rounds for every block.
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	# The first counter block is encrypted on its own before the
	# two-lane loop is entered.
	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# Fold the last output block into the CMAC with a final
	# single-block encryption; $in0 takes the $ivec slot of the
	# inline helper so it is xored in together with round key 0.
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec!
# (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp
#
# A block count of exactly 1 takes a shortcut that encrypts ivec as
# is. Otherwise six consecutive counter values are kept as two
# triplets, blocks are processed six at a time while at least six
# remain, and the final 1-5 blocks are dispatched to the
# 1x/2x/3x/4x/6x helpers.

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	# enter the 6x routine past its own prologue and first round
	&call		(&label("_aesni_encrypt6_enter"));

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&mov	("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void
aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1067 # const AES_KEY *key1, const AES_KEY *key2, 1068 # const unsigned char iv[16]); 1069 # key2 is used only to encrypt iv into the initial tweak; key1 processes the data. Aligned frame: 16*0..16*5(%esp) per-block tweaks, 16*6 the 0x87/1 constant, 16*7+0 $len, 16*7+4 caller's %esp. 1070 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1071 1072 &function_begin("aesni_xts_encrypt"); 1073 &mov ($key,&wparam(4)); # key2 1074 &mov ($inp,&wparam(5)); # clear-text tweak 1075 1076 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1077 &movups ($inout0,&QWP(0,$inp)); 1078 if ($inline) 1079 { &aesni_inline_generate1("enc"); } 1080 else 1081 { &call ("_aesni_encrypt1"); } 1082 1083 &mov ($inp,&wparam(0)); 1084 &mov ($out,&wparam(1)); 1085 &mov ($len,&wparam(2)); 1086 &mov ($key,&wparam(3)); # key1 1087 1088 &mov ($key_,"esp"); 1089 &sub ("esp",16*7+8); 1090 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1091 &and ("esp",-16); # align stack 1092 1093 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1094 &mov (&DWP(16*6+4,"esp"),0); 1095 &mov (&DWP(16*6+8,"esp"),1); 1096 &mov (&DWP(16*6+12,"esp"),0); 1097 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1098 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1099 1100 &movdqa ($tweak,$inout0); 1101 &pxor ($twtmp,$twtmp); 1102 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1103 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1104 1105 &and ($len,-16); 1106 &mov ($key_,$key); # backup $key 1107 &mov ($rounds_,$rounds); # backup $rounds 1108 &sub ($len,16*6); 1109 &jc (&label("xts_enc_short")); 1110 1111 &shl ($rounds,4); 1112 &mov ($rounds_,16); 1113 &sub ($rounds_,$rounds); 1114 &lea ($key,&DWP(32,$key,$rounds)); 1115 &jmp (&label("xts_enc_loop6")); 1116 1117 &set_label("xts_enc_loop6",16); 1118 for ($i=0;$i<4;$i++) { 1119 &pshufd ($twres,$twtmp,0x13); 1120 &pxor ($twtmp,$twtmp); 1121 &movdqa (&QWP(16*$i,"esp"),$tweak); 1122 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1123 &pand ($twres,$twmask); # isolate carry and residue 1124 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1125 &pxor ($tweak,$twres); 1126 }
1127 &pshufd ($inout5,$twtmp,0x13); 1128 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1129 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1130 &$movekey ($rndkey0,&QWP(0,$key_)); 1131 &pand ($inout5,$twmask); # isolate carry and residue 1132 &movups ($inout0,&QWP(0,$inp)); # load input 1133 &pxor ($inout5,$tweak); 1134 1135 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1136 &mov ($rounds,$rounds_); # restore $rounds 1137 &movdqu ($inout1,&QWP(16*1,$inp)); 1138 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1139 &movdqu ($inout2,&QWP(16*2,$inp)); 1140 &pxor ($inout1,$rndkey0); 1141 &movdqu ($inout3,&QWP(16*3,$inp)); 1142 &pxor ($inout2,$rndkey0); 1143 &movdqu ($inout4,&QWP(16*4,$inp)); 1144 &pxor ($inout3,$rndkey0); 1145 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1146 &pxor ($inout4,$rndkey0); 1147 &lea ($inp,&DWP(16*6,$inp)); 1148 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1149 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1150 &pxor ($inout5,$rndkey1); 1151 1152 &$movekey ($rndkey1,&QWP(16,$key_)); 1153 &pxor ($inout1,&QWP(16*1,"esp")); 1154 &pxor ($inout2,&QWP(16*2,"esp")); 1155 &aesenc ($inout0,$rndkey1); 1156 &pxor ($inout3,&QWP(16*3,"esp")); 1157 &pxor ($inout4,&QWP(16*4,"esp")); 1158 &aesenc ($inout1,$rndkey1); 1159 &pxor ($inout5,$rndkey0); 1160 &$movekey ($rndkey0,&QWP(32,$key_)); 1161 &aesenc ($inout2,$rndkey1); 1162 &aesenc ($inout3,$rndkey1); 1163 &aesenc ($inout4,$rndkey1); 1164 &aesenc ($inout5,$rndkey1); 1165 &call (&label("_aesni_encrypt6_enter")); 1166 1167 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1168 &pxor ($twtmp,$twtmp); 1169 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1170 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1171 &xorps ($inout1,&QWP(16*1,"esp")); 1172 &movups (&QWP(16*0,$out),$inout0); # write output 1173 &xorps ($inout2,&QWP(16*2,"esp")); 1174 &movups (&QWP(16*1,$out),$inout1); 1175 &xorps ($inout3,&QWP(16*3,"esp")); 1176 &movups (&QWP(16*2,$out),$inout2); 1177 &xorps ($inout4,&QWP(16*4,"esp"));
1178 &movups (&QWP(16*3,$out),$inout3); 1179 &xorps ($inout5,$tweak); 1180 &movups (&QWP(16*4,$out),$inout4); 1181 &pshufd ($twres,$twtmp,0x13); 1182 &movups (&QWP(16*5,$out),$inout5); 1183 &lea ($out,&DWP(16*6,$out)); 1184 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1185 1186 &pxor ($twtmp,$twtmp); 1187 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1188 &pand ($twres,$twmask); # isolate carry and residue 1189 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1190 &pxor ($tweak,$twres); 1191 1192 &sub ($len,16*6); 1193 &jnc (&label("xts_enc_loop6")); 1194 1195 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1196 &mov ($key,$key_); # restore $key 1197 &mov ($rounds_,$rounds); 1198 1199 &set_label("xts_enc_short"); 1200 &add ($len,16*6); 1201 &jz (&label("xts_enc_done6x")); 1202 1203 &movdqa ($inout3,$tweak); # put aside previous tweak 1204 &cmp ($len,0x20); 1205 &jb (&label("xts_enc_one")); 1206 1207 &pshufd ($twres,$twtmp,0x13); 1208 &pxor ($twtmp,$twtmp); 1209 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1210 &pand ($twres,$twmask); # isolate carry and residue 1211 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1212 &pxor ($tweak,$twres); 1213 &je (&label("xts_enc_two")); 1214 1215 &pshufd ($twres,$twtmp,0x13); 1216 &pxor ($twtmp,$twtmp); 1217 &movdqa ($inout4,$tweak); # put aside previous tweak 1218 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1219 &pand ($twres,$twmask); # isolate carry and residue 1220 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1221 &pxor ($tweak,$twres); 1222 &cmp ($len,0x40); 1223 &jb (&label("xts_enc_three")); 1224 1225 &pshufd ($twres,$twtmp,0x13); 1226 &pxor ($twtmp,$twtmp); 1227 &movdqa ($inout5,$tweak); # put aside previous tweak 1228 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1229 &pand ($twres,$twmask); # isolate carry and residue 1230 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1231 &pxor ($tweak,$twres); 1232 &movdqa (&QWP(16*0,"esp"),$inout3); 1233 &movdqa (&QWP(16*1,"esp"),$inout4); 1234 &je
(&label("xts_enc_four")); 1235 1236 &movdqa (&QWP(16*2,"esp"),$inout5); 1237 &pshufd ($inout5,$twtmp,0x13); 1238 &movdqa (&QWP(16*3,"esp"),$tweak); 1239 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1240 &pand ($inout5,$twmask); # isolate carry and residue 1241 &pxor ($inout5,$tweak); 1242 1243 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1244 &movdqu ($inout1,&QWP(16*1,$inp)); 1245 &movdqu ($inout2,&QWP(16*2,$inp)); 1246 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1247 &movdqu ($inout3,&QWP(16*3,$inp)); 1248 &pxor ($inout1,&QWP(16*1,"esp")); 1249 &movdqu ($inout4,&QWP(16*4,$inp)); 1250 &pxor ($inout2,&QWP(16*2,"esp")); 1251 &lea ($inp,&DWP(16*5,$inp)); 1252 &pxor ($inout3,&QWP(16*3,"esp")); 1253 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1254 &pxor ($inout4,$inout5); 1255 1256 &call ("_aesni_encrypt6"); 1257 1258 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1259 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1260 &xorps ($inout1,&QWP(16*1,"esp")); 1261 &xorps ($inout2,&QWP(16*2,"esp")); 1262 &movups (&QWP(16*0,$out),$inout0); # write output 1263 &xorps ($inout3,&QWP(16*3,"esp")); 1264 &movups (&QWP(16*1,$out),$inout1); 1265 &xorps ($inout4,$tweak); 1266 &movups (&QWP(16*2,$out),$inout2); 1267 &movups (&QWP(16*3,$out),$inout3); 1268 &movups (&QWP(16*4,$out),$inout4); 1269 &lea ($out,&DWP(16*5,$out)); 1270 &jmp (&label("xts_enc_done")); 1271 1272 &set_label("xts_enc_one",16); 1273 &movups ($inout0,&QWP(16*0,$inp)); # load input 1274 &lea ($inp,&DWP(16*1,$inp)); 1275 &xorps ($inout0,$inout3); # input^=tweak 1276 if ($inline) 1277 { &aesni_inline_generate1("enc"); } 1278 else 1279 { &call ("_aesni_encrypt1"); } 1280 &xorps ($inout0,$inout3); # output^=tweak 1281 &movups (&QWP(16*0,$out),$inout0); # write output 1282 &lea ($out,&DWP(16*1,$out)); 1283 1284 &movdqa ($tweak,$inout3); # last tweak 1285 &jmp (&label("xts_enc_done")); 1286 1287 &set_label("xts_enc_two",16); 1288 &movaps ($inout4,$tweak); # put aside last tweak 1289 1290 &movups
($inout0,&QWP(16*0,$inp)); # load input 1291 &movups ($inout1,&QWP(16*1,$inp)); 1292 &lea ($inp,&DWP(16*2,$inp)); 1293 &xorps ($inout0,$inout3); # input^=tweak 1294 &xorps ($inout1,$inout4); 1295 1296 &call ("_aesni_encrypt2"); 1297 1298 &xorps ($inout0,$inout3); # output^=tweak 1299 &xorps ($inout1,$inout4); 1300 &movups (&QWP(16*0,$out),$inout0); # write output 1301 &movups (&QWP(16*1,$out),$inout1); 1302 &lea ($out,&DWP(16*2,$out)); 1303 1304 &movdqa ($tweak,$inout4); # last tweak 1305 &jmp (&label("xts_enc_done")); 1306 1307 &set_label("xts_enc_three",16); 1308 &movaps ($inout5,$tweak); # put aside last tweak 1309 &movups ($inout0,&QWP(16*0,$inp)); # load input 1310 &movups ($inout1,&QWP(16*1,$inp)); 1311 &movups ($inout2,&QWP(16*2,$inp)); 1312 &lea ($inp,&DWP(16*3,$inp)); 1313 &xorps ($inout0,$inout3); # input^=tweak 1314 &xorps ($inout1,$inout4); 1315 &xorps ($inout2,$inout5); 1316 1317 &call ("_aesni_encrypt3"); 1318 1319 &xorps ($inout0,$inout3); # output^=tweak 1320 &xorps ($inout1,$inout4); 1321 &xorps ($inout2,$inout5); 1322 &movups (&QWP(16*0,$out),$inout0); # write output 1323 &movups (&QWP(16*1,$out),$inout1); 1324 &movups (&QWP(16*2,$out),$inout2); 1325 &lea ($out,&DWP(16*3,$out)); 1326 1327 &movdqa ($tweak,$inout5); # last tweak 1328 &jmp (&label("xts_enc_done")); 1329 1330 &set_label("xts_enc_four",16); 1331 &movaps ($inout4,$tweak); # put aside last tweak 1332 1333 &movups ($inout0,&QWP(16*0,$inp)); # load input 1334 &movups ($inout1,&QWP(16*1,$inp)); 1335 &movups ($inout2,&QWP(16*2,$inp)); 1336 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1337 &movups ($inout3,&QWP(16*3,$inp)); 1338 &lea ($inp,&DWP(16*4,$inp)); 1339 &xorps ($inout1,&QWP(16*1,"esp")); 1340 &xorps ($inout2,$inout5); 1341 &xorps ($inout3,$inout4); 1342 1343 &call ("_aesni_encrypt4"); 1344 1345 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1346 &xorps ($inout1,&QWP(16*1,"esp")); 1347 &xorps ($inout2,$inout5); 1348 &movups (&QWP(16*0,$out),$inout0); # write output 1349
&xorps ($inout3,$inout4); 1350 &movups (&QWP(16*1,$out),$inout1); 1351 &movups (&QWP(16*2,$out),$inout2); 1352 &movups (&QWP(16*3,$out),$inout3); 1353 &lea ($out,&DWP(16*4,$out)); 1354 1355 &movdqa ($tweak,$inout4); # last tweak 1356 &jmp (&label("xts_enc_done")); 1357 1358 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1359 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1360 &and ($len,15); 1361 &jz (&label("xts_enc_ret")); 1362 &movdqa ($inout3,$tweak); 1363 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1364 &jmp (&label("xts_enc_steal")); 1365 1366 &set_label("xts_enc_done",16); 1367 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1368 &pxor ($twtmp,$twtmp); 1369 &and ($len,15); 1370 &jz (&label("xts_enc_ret")); 1371 1372 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1373 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1374 &pshufd ($inout3,$twtmp,0x13); 1375 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1376 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1377 &pxor ($inout3,$tweak); 1378 1379 &set_label("xts_enc_steal"); # each pass moves one tail input byte into the last output block and the byte it displaces to the output tail 1380 &movz ($rounds,&BP(0,$inp)); 1381 &movz ($key,&BP(-16,$out)); 1382 &lea ($inp,&DWP(1,$inp)); 1383 &mov (&BP(-16,$out),&LB($rounds)); 1384 &mov (&BP(0,$out),&LB($key)); 1385 &lea ($out,&DWP(1,$out)); 1386 &sub ($len,1); 1387 &jnz (&label("xts_enc_steal")); 1388 1389 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1390 &mov ($key,$key_); # restore $key 1391 &mov ($rounds,$rounds_); # restore $rounds 1392 1393 &movups ($inout0,&QWP(-16,$out)); # load input 1394 &xorps ($inout0,$inout3); # input^=tweak 1395 if ($inline) 1396 { &aesni_inline_generate1("enc"); } 1397 else 1398 { &call ("_aesni_encrypt1"); } 1399 &xorps ($inout0,$inout3); # output^=tweak 1400 &movups (&QWP(-16,$out),$inout0); # write output 1401 1402 &set_label("xts_enc_ret"); 1403 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1404 &function_end("aesni_xts_encrypt"); 1405 1406 &function_begin("aesni_xts_decrypt"); 1407 &mov
($key,&wparam(4)); # key2 1408 &mov ($inp,&wparam(5)); # clear-text tweak 1409 1410 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1411 &movups ($inout0,&QWP(0,$inp)); 1412 if ($inline) 1413 { &aesni_inline_generate1("enc"); } # the tweak is encrypted with key2 even on the decrypt path 1414 else 1415 { &call ("_aesni_encrypt1"); } 1416 1417 &mov ($inp,&wparam(0)); 1418 &mov ($out,&wparam(1)); 1419 &mov ($len,&wparam(2)); 1420 &mov ($key,&wparam(3)); # key1 1421 1422 &mov ($key_,"esp"); 1423 &sub ("esp",16*7+8); 1424 &and ("esp",-16); # align stack 1425 1426 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1427 &test ($len,15); 1428 &setnz (&LB($rounds_)); 1429 &shl ($rounds_,4); 1430 &sub ($len,$rounds_); 1431 1432 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1433 &mov (&DWP(16*6+4,"esp"),0); 1434 &mov (&DWP(16*6+8,"esp"),1); 1435 &mov (&DWP(16*6+12,"esp"),0); 1436 &mov (&DWP(16*7+0,"esp"),$len); # save $len (already reduced by 16 when len%16!=0; len%16 itself is unchanged) 1437 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1438 1439 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1440 &mov ($key_,$key); # backup $key 1441 &mov ($rounds_,$rounds); # backup $rounds 1442 1443 &movdqa ($tweak,$inout0); 1444 &pxor ($twtmp,$twtmp); 1445 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1446 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1447 1448 &and ($len,-16); 1449 &sub ($len,16*6); 1450 &jc (&label("xts_dec_short")); 1451 1452 &shl ($rounds,4); 1453 &mov ($rounds_,16); 1454 &sub ($rounds_,$rounds); 1455 &lea ($key,&DWP(32,$key,$rounds)); 1456 &jmp (&label("xts_dec_loop6")); 1457 1458 &set_label("xts_dec_loop6",16); 1459 for ($i=0;$i<4;$i++) { 1460 &pshufd ($twres,$twtmp,0x13); 1461 &pxor ($twtmp,$twtmp); 1462 &movdqa (&QWP(16*$i,"esp"),$tweak); 1463 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1464 &pand ($twres,$twmask); # isolate carry and residue 1465 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1466 &pxor ($tweak,$twres); 1467 } 1468 &pshufd ($inout5,$twtmp,0x13); 1469 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1470 &paddq ($tweak,$tweak); #
&psllq($tweak,1); 1471 &$movekey ($rndkey0,&QWP(0,$key_)); 1472 &pand ($inout5,$twmask); # isolate carry and residue 1473 &movups ($inout0,&QWP(0,$inp)); # load input 1474 &pxor ($inout5,$tweak); 1475 1476 # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0] 1477 &mov ($rounds,$rounds_); 1478 &movdqu ($inout1,&QWP(16*1,$inp)); 1479 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1480 &movdqu ($inout2,&QWP(16*2,$inp)); 1481 &pxor ($inout1,$rndkey0); 1482 &movdqu ($inout3,&QWP(16*3,$inp)); 1483 &pxor ($inout2,$rndkey0); 1484 &movdqu ($inout4,&QWP(16*4,$inp)); 1485 &pxor ($inout3,$rndkey0); 1486 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1487 &pxor ($inout4,$rndkey0); 1488 &lea ($inp,&DWP(16*6,$inp)); 1489 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1490 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1491 &pxor ($inout5,$rndkey1); 1492 1493 &$movekey ($rndkey1,&QWP(16,$key_)); 1494 &pxor ($inout1,&QWP(16*1,"esp")); 1495 &pxor ($inout2,&QWP(16*2,"esp")); 1496 &aesdec ($inout0,$rndkey1); 1497 &pxor ($inout3,&QWP(16*3,"esp")); 1498 &pxor ($inout4,&QWP(16*4,"esp")); 1499 &aesdec ($inout1,$rndkey1); 1500 &pxor ($inout5,$rndkey0); 1501 &$movekey ($rndkey0,&QWP(32,$key_)); 1502 &aesdec ($inout2,$rndkey1); 1503 &aesdec ($inout3,$rndkey1); 1504 &aesdec ($inout4,$rndkey1); 1505 &aesdec ($inout5,$rndkey1); 1506 &call (&label("_aesni_decrypt6_enter")); 1507 1508 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1509 &pxor ($twtmp,$twtmp); 1510 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1511 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1512 &xorps ($inout1,&QWP(16*1,"esp")); 1513 &movups (&QWP(16*0,$out),$inout0); # write output 1514 &xorps ($inout2,&QWP(16*2,"esp")); 1515 &movups (&QWP(16*1,$out),$inout1); 1516 &xorps ($inout3,&QWP(16*3,"esp")); 1517 &movups (&QWP(16*2,$out),$inout2); 1518 &xorps ($inout4,&QWP(16*4,"esp")); 1519 &movups (&QWP(16*3,$out),$inout3); 1520 &xorps ($inout5,$tweak); 1521 &movups (&QWP(16*4,$out),$inout4); 1522 &pshufd
($twres,$twtmp,0x13); 1523 &movups (&QWP(16*5,$out),$inout5); 1524 &lea ($out,&DWP(16*6,$out)); 1525 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1526 1527 &pxor ($twtmp,$twtmp); 1528 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1529 &pand ($twres,$twmask); # isolate carry and residue 1530 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1531 &pxor ($tweak,$twres); 1532 1533 &sub ($len,16*6); 1534 &jnc (&label("xts_dec_loop6")); 1535 1536 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1537 &mov ($key,$key_); # restore $key 1538 &mov ($rounds_,$rounds); 1539 1540 &set_label("xts_dec_short"); 1541 &add ($len,16*6); 1542 &jz (&label("xts_dec_done6x")); 1543 1544 &movdqa ($inout3,$tweak); # put aside previous tweak 1545 &cmp ($len,0x20); 1546 &jb (&label("xts_dec_one")); 1547 1548 &pshufd ($twres,$twtmp,0x13); 1549 &pxor ($twtmp,$twtmp); 1550 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1551 &pand ($twres,$twmask); # isolate carry and residue 1552 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1553 &pxor ($tweak,$twres); 1554 &je (&label("xts_dec_two")); 1555 1556 &pshufd ($twres,$twtmp,0x13); 1557 &pxor ($twtmp,$twtmp); 1558 &movdqa ($inout4,$tweak); # put aside previous tweak 1559 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1560 &pand ($twres,$twmask); # isolate carry and residue 1561 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1562 &pxor ($tweak,$twres); 1563 &cmp ($len,0x40); 1564 &jb (&label("xts_dec_three")); 1565 1566 &pshufd ($twres,$twtmp,0x13); 1567 &pxor ($twtmp,$twtmp); 1568 &movdqa ($inout5,$tweak); # put aside previous tweak 1569 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1570 &pand ($twres,$twmask); # isolate carry and residue 1571 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1572 &pxor ($tweak,$twres); 1573 &movdqa (&QWP(16*0,"esp"),$inout3); 1574 &movdqa (&QWP(16*1,"esp"),$inout4); 1575 &je (&label("xts_dec_four")); 1576 1577 &movdqa (&QWP(16*2,"esp"),$inout5); 1578 &pshufd ($inout5,$twtmp,0x13); 1579 &movdqa (&QWP(16*3,"esp"),$tweak);
1580 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1581 &pand ($inout5,$twmask); # isolate carry and residue 1582 &pxor ($inout5,$tweak); 1583 1584 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1585 &movdqu ($inout1,&QWP(16*1,$inp)); 1586 &movdqu ($inout2,&QWP(16*2,$inp)); 1587 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1588 &movdqu ($inout3,&QWP(16*3,$inp)); 1589 &pxor ($inout1,&QWP(16*1,"esp")); 1590 &movdqu ($inout4,&QWP(16*4,$inp)); 1591 &pxor ($inout2,&QWP(16*2,"esp")); 1592 &lea ($inp,&DWP(16*5,$inp)); 1593 &pxor ($inout3,&QWP(16*3,"esp")); 1594 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1595 &pxor ($inout4,$inout5); 1596 1597 &call ("_aesni_decrypt6"); 1598 1599 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1600 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1601 &xorps ($inout1,&QWP(16*1,"esp")); 1602 &xorps ($inout2,&QWP(16*2,"esp")); 1603 &movups (&QWP(16*0,$out),$inout0); # write output 1604 &xorps ($inout3,&QWP(16*3,"esp")); 1605 &movups (&QWP(16*1,$out),$inout1); 1606 &xorps ($inout4,$tweak); 1607 &movups (&QWP(16*2,$out),$inout2); 1608 &movups (&QWP(16*3,$out),$inout3); 1609 &movups (&QWP(16*4,$out),$inout4); 1610 &lea ($out,&DWP(16*5,$out)); 1611 &jmp (&label("xts_dec_done")); 1612 1613 &set_label("xts_dec_one",16); 1614 &movups ($inout0,&QWP(16*0,$inp)); # load input 1615 &lea ($inp,&DWP(16*1,$inp)); 1616 &xorps ($inout0,$inout3); # input^=tweak 1617 if ($inline) 1618 { &aesni_inline_generate1("dec"); } 1619 else 1620 { &call ("_aesni_decrypt1"); } 1621 &xorps ($inout0,$inout3); # output^=tweak 1622 &movups (&QWP(16*0,$out),$inout0); # write output 1623 &lea ($out,&DWP(16*1,$out)); 1624 1625 &movdqa ($tweak,$inout3); # last tweak 1626 &jmp (&label("xts_dec_done")); 1627 1628 &set_label("xts_dec_two",16); 1629 &movaps ($inout4,$tweak); # put aside last tweak 1630 1631 &movups ($inout0,&QWP(16*0,$inp)); # load input 1632 &movups ($inout1,&QWP(16*1,$inp)); 1633 &lea ($inp,&DWP(16*2,$inp)); 1634 &xorps ($inout0,$inout3); #
input^=tweak 1635 &xorps ($inout1,$inout4); 1636 1637 &call ("_aesni_decrypt2"); 1638 1639 &xorps ($inout0,$inout3); # output^=tweak 1640 &xorps ($inout1,$inout4); 1641 &movups (&QWP(16*0,$out),$inout0); # write output 1642 &movups (&QWP(16*1,$out),$inout1); 1643 &lea ($out,&DWP(16*2,$out)); 1644 1645 &movdqa ($tweak,$inout4); # last tweak 1646 &jmp (&label("xts_dec_done")); 1647 1648 &set_label("xts_dec_three",16); 1649 &movaps ($inout5,$tweak); # put aside last tweak 1650 &movups ($inout0,&QWP(16*0,$inp)); # load input 1651 &movups ($inout1,&QWP(16*1,$inp)); 1652 &movups ($inout2,&QWP(16*2,$inp)); 1653 &lea ($inp,&DWP(16*3,$inp)); 1654 &xorps ($inout0,$inout3); # input^=tweak 1655 &xorps ($inout1,$inout4); 1656 &xorps ($inout2,$inout5); 1657 1658 &call ("_aesni_decrypt3"); 1659 1660 &xorps ($inout0,$inout3); # output^=tweak 1661 &xorps ($inout1,$inout4); 1662 &xorps ($inout2,$inout5); 1663 &movups (&QWP(16*0,$out),$inout0); # write output 1664 &movups (&QWP(16*1,$out),$inout1); 1665 &movups (&QWP(16*2,$out),$inout2); 1666 &lea ($out,&DWP(16*3,$out)); 1667 1668 &movdqa ($tweak,$inout5); # last tweak 1669 &jmp (&label("xts_dec_done")); 1670 1671 &set_label("xts_dec_four",16); 1672 &movaps ($inout4,$tweak); # put aside last tweak 1673 1674 &movups ($inout0,&QWP(16*0,$inp)); # load input 1675 &movups ($inout1,&QWP(16*1,$inp)); 1676 &movups ($inout2,&QWP(16*2,$inp)); 1677 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1678 &movups ($inout3,&QWP(16*3,$inp)); 1679 &lea ($inp,&DWP(16*4,$inp)); 1680 &xorps ($inout1,&QWP(16*1,"esp")); 1681 &xorps ($inout2,$inout5); 1682 &xorps ($inout3,$inout4); 1683 1684 &call ("_aesni_decrypt4"); 1685 1686 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1687 &xorps ($inout1,&QWP(16*1,"esp")); 1688 &xorps ($inout2,$inout5); 1689 &movups (&QWP(16*0,$out),$inout0); # write output 1690 &xorps ($inout3,$inout4); 1691 &movups (&QWP(16*1,$out),$inout1); 1692 &movups (&QWP(16*2,$out),$inout2); 1693 &movups (&QWP(16*3,$out),$inout3); 1694
&lea ($out,&DWP(16*4,$out)); 1695 1696 &movdqa ($tweak,$inout4); # last tweak 1697 &jmp (&label("xts_dec_done")); 1698 1699 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1700 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1701 &and ($len,15); 1702 &jz (&label("xts_dec_ret")); 1703 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1704 &jmp (&label("xts_dec_only_one_more")); 1705 1706 &set_label("xts_dec_done",16); 1707 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1708 &pxor ($twtmp,$twtmp); 1709 &and ($len,15); 1710 &jz (&label("xts_dec_ret")); 1711 1712 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1713 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1714 &pshufd ($twres,$twtmp,0x13); 1715 &pxor ($twtmp,$twtmp); 1716 &movdqa ($twmask,&QWP(16*6,"esp")); 1717 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1718 &pand ($twres,$twmask); # isolate carry and residue 1719 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1720 &pxor ($tweak,$twres); 1721 1722 &set_label("xts_dec_only_one_more"); 1723 &pshufd ($inout3,$twtmp,0x13); 1724 &movdqa ($inout4,$tweak); # put aside previous tweak 1725 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1726 &pand ($inout3,$twmask); # isolate carry and residue 1727 &pxor ($inout3,$tweak); 1728 1729 &mov ($key,$key_); # restore $key 1730 &mov ($rounds,$rounds_); # restore $rounds 1731 1732 &movups ($inout0,&QWP(0,$inp)); # load input 1733 &xorps ($inout0,$inout3); # input^=tweak 1734 if ($inline) 1735 { &aesni_inline_generate1("dec"); } 1736 else 1737 { &call ("_aesni_decrypt1"); } 1738 &xorps ($inout0,$inout3); # output^=tweak 1739 &movups (&QWP(0,$out),$inout0); # write output 1740 1741 &set_label("xts_dec_steal"); 1742 &movz ($rounds,&BP(16,$inp)); 1743 &movz ($key,&BP(0,$out)); 1744 &lea ($inp,&DWP(1,$inp)); 1745 &mov (&BP(0,$out),&LB($rounds)); 1746 &mov (&BP(16,$out),&LB($key)); 1747 &lea ($out,&DWP(1,$out)); 1748 &sub ($len,1); 1749 &jnz (&label("xts_dec_steal")); 1750 1751 &sub ($out,&DWP(16*7+0,"esp")); #
rewind $out 1752 &mov ($key,$key_); # restore $key 1753 &mov ($rounds,$rounds_); # restore $rounds 1754 1755 &movups ($inout0,&QWP(0,$out)); # load input 1756 &xorps ($inout0,$inout4); # input^=tweak 1757 if ($inline) 1758 { &aesni_inline_generate1("dec"); } 1759 else 1760 { &call ("_aesni_decrypt1"); } 1761 &xorps ($inout0,$inout4); # output^=tweak 1762 &movups (&QWP(0,$out),$inout0); # write output 1763 1764 &set_label("xts_dec_ret"); 1765 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1766 &function_end("aesni_xts_decrypt"); 1767 } 1768 } 1769 1771 ###################################################################### 1772 # void $PREFIX_cbc_encrypt (const void *inp, void *out, 1773 # size_t length, const AES_KEY *key, 1774 # unsigned char *ivp,const int enc); # enc==0 selects decryption; the updated IV is stored back to *ivp unless length is 0 1775 &function_begin("${PREFIX}_cbc_encrypt"); 1776 &mov ($inp,&wparam(0)); 1777 &mov ($rounds_,"esp"); 1778 &mov ($out,&wparam(1)); 1779 &sub ($rounds_,24); 1780 &mov ($len,&wparam(2)); 1781 &and ($rounds_,-16); 1782 &mov ($key,&wparam(3)); 1783 &mov ($key_,&wparam(4)); 1784 &test ($len,$len); 1785 &jz (&label("cbc_abort")); 1786 1787 &cmp (&wparam(5),0); 1788 &xchg ($rounds_,"esp"); # alloca 1789 &movups ($ivec,&QWP(0,$key_)); # load IV 1790 &mov ($rounds,&DWP(240,$key)); 1791 &mov ($key_,$key); # backup $key 1792 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1793 &mov ($rounds_,$rounds); # backup $rounds 1794 &je (&label("cbc_decrypt")); 1795 1796 &movaps ($inout0,$ivec); 1797 &cmp ($len,16); 1798 &jb (&label("cbc_enc_tail")); 1799 &sub ($len,16); 1800 &jmp (&label("cbc_enc_loop")); 1801 1802 &set_label("cbc_enc_loop",16); 1803 &movups ($ivec,&QWP(0,$inp)); # input actually 1804 &lea ($inp,&DWP(16,$inp)); 1805 if ($inline) 1806 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1807 else 1808 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1809 &mov ($rounds,$rounds_); # restore $rounds 1810 &mov ($key,$key_); # restore $key 1811 &movups (&QWP(0,$out),$inout0); # store output 1812 &lea
($out,&DWP(16,$out)); 1813 &sub ($len,16); 1814 &jnc (&label("cbc_enc_loop")); 1815 &add ($len,16); 1816 &jnz (&label("cbc_enc_tail")); 1817 &movaps ($ivec,$inout0); 1818 &jmp (&label("cbc_ret")); 1819 1820 &set_label("cbc_enc_tail"); 1821 &mov ("ecx",$len); # zaps $rounds 1822 &data_word(0xA4F3F689); # rep movsb 1823 &mov ("ecx",16); # zero tail 1824 &sub ("ecx",$len); 1825 &xor ("eax","eax"); # zaps $len 1826 &data_word(0xAAF3F689); # rep stosb 1827 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1828 &mov ($rounds,$rounds_); # restore $rounds 1829 &mov ($inp,$out); # $inp and $out are the same 1830 &mov ($key,$key_); # restore $key 1831 &jmp (&label("cbc_enc_loop")); 1832 ###################################################################### 1833 &set_label("cbc_decrypt",16); 1834 &cmp ($len,0x50); 1835 &jbe (&label("cbc_dec_tail")); 1836 &movaps (&QWP(0,"esp"),$ivec); # save IV 1837 &sub ($len,0x50); 1838 &jmp (&label("cbc_dec_loop6_enter")); 1839 1840 &set_label("cbc_dec_loop6",16); 1841 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1842 &movups (&QWP(0,$out),$inout5); 1843 &lea ($out,&DWP(0x10,$out)); 1844 &set_label("cbc_dec_loop6_enter"); 1845 &movdqu ($inout0,&QWP(0,$inp)); 1846 &movdqu ($inout1,&QWP(0x10,$inp)); 1847 &movdqu ($inout2,&QWP(0x20,$inp)); 1848 &movdqu ($inout3,&QWP(0x30,$inp)); 1849 &movdqu ($inout4,&QWP(0x40,$inp)); 1850 &movdqu ($inout5,&QWP(0x50,$inp)); 1851 1852 &call ("_aesni_decrypt6"); 1853 1854 &movups ($rndkey1,&QWP(0,$inp)); 1855 &movups ($rndkey0,&QWP(0x10,$inp)); 1856 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1857 &xorps ($inout1,$rndkey1); 1858 &movups ($rndkey1,&QWP(0x20,$inp)); 1859 &xorps ($inout2,$rndkey0); 1860 &movups ($rndkey0,&QWP(0x30,$inp)); 1861 &xorps ($inout3,$rndkey1); 1862 &movups ($rndkey1,&QWP(0x40,$inp)); 1863 &xorps ($inout4,$rndkey0); 1864 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1865 &xorps ($inout5,$rndkey1); 1866 &movups (&QWP(0,$out),$inout0); 1867 &movups (&QWP(0x10,$out),$inout1); 1868 &lea 
($inp,&DWP(0x60,$inp)); 1869 &movups (&QWP(0x20,$out),$inout2); 1870 &mov ($rounds,$rounds_); # restore $rounds 1871 &movups (&QWP(0x30,$out),$inout3); 1872 &mov ($key,$key_); # restore $key 1873 &movups (&QWP(0x40,$out),$inout4); 1874 &lea ($out,&DWP(0x50,$out)); 1875 &sub ($len,0x60); 1876 &ja (&label("cbc_dec_loop6")); 1877 1878 &movaps ($inout0,$inout5); 1879 &movaps ($ivec,$rndkey0); 1880 &add ($len,0x50); 1881 &jle (&label("cbc_dec_tail_collected")); 1882 &movups (&QWP(0,$out),$inout0); 1883 &lea ($out,&DWP(0x10,$out)); 1884 &set_label("cbc_dec_tail"); 1885 &movups ($inout0,&QWP(0,$inp)); 1886 &movaps ($in0,$inout0); 1887 &cmp ($len,0x10); 1888 &jbe (&label("cbc_dec_one")); 1889 1890 &movups ($inout1,&QWP(0x10,$inp)); 1891 &movaps ($in1,$inout1); 1892 &cmp ($len,0x20); 1893 &jbe (&label("cbc_dec_two")); 1894 1895 &movups ($inout2,&QWP(0x20,$inp)); 1896 &cmp ($len,0x30); 1897 &jbe (&label("cbc_dec_three")); 1898 1899 &movups ($inout3,&QWP(0x30,$inp)); 1900 &cmp ($len,0x40); 1901 &jbe (&label("cbc_dec_four")); 1902 1903 &movups ($inout4,&QWP(0x40,$inp)); 1904 &movaps (&QWP(0,"esp"),$ivec); # save IV 1905 &movups ($inout0,&QWP(0,$inp)); 1906 &xorps ($inout5,$inout5); 1907 &call ("_aesni_decrypt6"); 1908 &movups ($rndkey1,&QWP(0,$inp)); 1909 &movups ($rndkey0,&QWP(0x10,$inp)); 1910 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 1911 &xorps ($inout1,$rndkey1); 1912 &movups ($rndkey1,&QWP(0x20,$inp)); 1913 &xorps ($inout2,$rndkey0); 1914 &movups ($rndkey0,&QWP(0x30,$inp)); 1915 &xorps ($inout3,$rndkey1); 1916 &movups ($ivec,&QWP(0x40,$inp)); # IV 1917 &xorps ($inout4,$rndkey0); 1918 &movups (&QWP(0,$out),$inout0); 1919 &movups (&QWP(0x10,$out),$inout1); 1920 &movups (&QWP(0x20,$out),$inout2); 1921 &movups (&QWP(0x30,$out),$inout3); 1922 &lea ($out,&DWP(0x40,$out)); 1923 &movaps ($inout0,$inout4); 1924 &sub ($len,0x50); 1925 &jmp (&label("cbc_dec_tail_collected")); 1926 1927 &set_label("cbc_dec_one",16); 1928 if ($inline) 1929 { &aesni_inline_generate1("dec"); } 1930 else 
1931 { &call ("_aesni_decrypt1"); } 1932 &xorps ($inout0,$ivec); 1933 &movaps ($ivec,$in0); 1934 &sub ($len,0x10); 1935 &jmp (&label("cbc_dec_tail_collected")); 1936 1937 &set_label("cbc_dec_two",16); 1938 &call ("_aesni_decrypt2"); 1939 &xorps ($inout0,$ivec); 1940 &xorps ($inout1,$in0); 1941 &movups (&QWP(0,$out),$inout0); 1942 &movaps ($inout0,$inout1); 1943 &lea ($out,&DWP(0x10,$out)); 1944 &movaps ($ivec,$in1); 1945 &sub ($len,0x20); 1946 &jmp (&label("cbc_dec_tail_collected")); 1947 1948 &set_label("cbc_dec_three",16); 1949 &call ("_aesni_decrypt3"); 1950 &xorps ($inout0,$ivec); 1951 &xorps ($inout1,$in0); 1952 &xorps ($inout2,$in1); 1953 &movups (&QWP(0,$out),$inout0); 1954 &movaps ($inout0,$inout2); 1955 &movups (&QWP(0x10,$out),$inout1); 1956 &lea ($out,&DWP(0x20,$out)); 1957 &movups ($ivec,&QWP(0x20,$inp)); 1958 &sub ($len,0x30); 1959 &jmp (&label("cbc_dec_tail_collected")); 1960 1961 &set_label("cbc_dec_four",16); 1962 &call ("_aesni_decrypt4"); 1963 &movups ($rndkey1,&QWP(0x10,$inp)); 1964 &movups ($rndkey0,&QWP(0x20,$inp)); 1965 &xorps ($inout0,$ivec); 1966 &movups ($ivec,&QWP(0x30,$inp)); 1967 &xorps ($inout1,$in0); 1968 &movups (&QWP(0,$out),$inout0); 1969 &xorps ($inout2,$rndkey1); 1970 &movups (&QWP(0x10,$out),$inout1); 1971 &xorps ($inout3,$rndkey0); 1972 &movups (&QWP(0x20,$out),$inout2); 1973 &lea ($out,&DWP(0x30,$out)); 1974 &movaps ($inout0,$inout3); 1975 &sub ($len,0x40); 1976 1977 &set_label("cbc_dec_tail_collected"); 1978 &and ($len,15); 1979 &jnz (&label("cbc_dec_tail_partial")); 1980 &movups (&QWP(0,$out),$inout0); 1981 &jmp (&label("cbc_ret")); 1982 1983 &set_label("cbc_dec_tail_partial",16); 1984 &movaps (&QWP(0,"esp"),$inout0); 1985 &mov ("ecx",16); 1986 &mov ($inp,"esp"); 1987 &sub ("ecx",$len); 1988 &data_word(0xA4F3F689); # rep movsb 1989 1990 &set_label("cbc_ret"); 1991 &mov ("esp",&DWP(16,"esp")); # pull original %esp 1992 &mov ($key_,&wparam(4)); 1993 &movups (&QWP(0,$key_),$ivec); # output IV 1994 &set_label("cbc_abort"); 1995 
&function_end("${PREFIX}_cbc_encrypt");

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits (128, 192 or 256)
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not one of the supported sizes
#	$rounds	rounds-1 (9, 11 or 13); the same value is also stored
#		right behind the last round key, i.e. at byte 240 of *key
#
# $key is advanced while the schedule is written, so callers that
# need the AES_KEY pointer afterwards have to reload it.
#
# Every exit that has touched key material wipes the %xmm registers it
# used, so that neither the user key nor round keys linger in the SSE
# register file after return.
# NOTE(review): this relies on callers treating %xmm0-%xmm5 as scratch
# across the call; the two wrappers below do - confirm there is no
# other caller elsewhere in the file that expects them preserved.

&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(80,$key),$rounds);	# key->rounds
	&jmp		(&label("good_key"));		# wipe and return 0

# key_128 stores the round key held in xmm0 and advances $key by one
# slot; key_128_cold (entered directly for the very first round, when
# round 0 is already stored) folds the aeskeygenassist result in xmm1
# into xmm0 to produce the next round key.
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(48,$key),$rounds);	# key->rounds
	&jmp		(&label("good_key"));		# wipe and return 0

# key_192a stores one 16-byte slot from xmm0, key_192b stores two
# slots assembled from xmm5/xmm0/xmm2; both then share the expansion
# step at key_192b_warm, which updates xmm0 and xmm2.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");		# keep copy for key_192b
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov		($rounds,13);
	&lea		($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(16,$key),$rounds);	# key->rounds
	&jmp		(&label("good_key"));		# wipe and return 0

# key_256a stores xmm2 and updates xmm0; key_256b stores xmm0 and
# updates xmm2.  The two halves of the 256-bit state alternate.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

# Common successful exit: wipe every %xmm register the expansion paths
# above may have loaded with key material, then return 0.  pxor leaves
# the general-purpose registers alone, so $rounds is still valid.
&set_label("good_key");
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);		# nothing was loaded yet, nothing to wipe
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");	# first 128 bits of *userKey were loaded
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# expected by _aesni_set_encrypt_key and passes its return code through.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule and then converts it in place: the
# first and last round keys are swapped as they are, the remaining ones
# are swapped pairwise from both ends and passed through aesimc, and the
# middle one is passed through aesimc where it stands.  Returns 0 on
# success, otherwise the error code of _aesni_set_encrypt_key.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# $key was advanced by the callee
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor		("xmm0","xmm0");	# wipe round keys held in
	&pxor		("xmm1","xmm1");	# the two work registers
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();