1 #!/usr/bin/env perl 2 # 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 # 10 # This module implements support for Intel AES-NI extension. In 11 # OpenSSL context it's used with Intel engine, but can also be used as 12 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 13 # details]. 14 # 15 # Performance. 16 # 17 # Given aes(enc|dec) instructions' latency asymptotic performance for 18 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 19 # processed with 128-bit key. And given their throughput asymptotic 20 # performance for parallelizable modes is 1.25 cycles per byte. Being 21 # asymptotic limit it's not something you commonly achieve in reality, 22 # but how close does one get? Below are results collected for 23 # different modes and block sized. Pairs of numbers are for en-/ 24 # decryption. 25 # 26 # 16-byte 64-byte 256-byte 1-KB 8-KB 27 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 28 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 29 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 30 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 31 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 32 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 33 # 34 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means 35 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 36 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
71 # 72 # EVP-free results were observed to scale perfectly with number of 73 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times 74 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 75 # are a tad smaller, because the above mentioned penalty biases all 76 # results by same constant value. In similar way function call 77 # overhead affects small-block performance, as well as OFB and CFB 78 # results. Differences are not large, most common coefficients are 79 # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 80 # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 81 82 # January 2011 83 # 84 # While Westmere processor features 6 cycles latency for aes[enc|dec] 85 # instructions, which can be scheduled every second cycle, Sandy 86 # Bridge spends 8 cycles per instruction, but it can schedule them 87 # every cycle. This means that code targeting Westmere would perform 88 # suboptimally on Sandy Bridge. Therefore this update. 89 # 90 # In addition, non-parallelizable CBC encrypt (as well as CCM) is 91 # optimized. Relative improvement might appear modest, 8% on Westmere, 92 # but in absolute terms it's 3.77 cycles per byte encrypted with 93 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 94 # should be compared to asymptotic limits of 3.75 for Westmere and 95 # 5.00 for Sandy Bridge. Actually, the fact that they get this close 96 # to asymptotic limits is quite amazing. Indeed, the limit is 97 # calculated as latency times number of rounds, 10 for 128-bit key, 98 # and divided by 16, the number of bytes in block, or in other words 99 # it accounts *solely* for aesenc instructions. But there are extra 100 # instructions, and numbers so close to the asymptotic limits mean 101 # that it's as if it takes as little as *one* additional cycle to 102 # execute all of them. How is it possible? 
It is possible thanks to 103 # out-of-order execution logic, which manages to overlap post- 104 # processing of previous block, things like saving the output, with 105 # actual encryption of current block, as well as pre-processing of 106 # current block, things like fetching input and xor-ing it with 107 # 0-round element of the key schedule, with actual encryption of 108 # previous block. Keep this in mind... 109 # 110 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 111 # performance is achieved by interleaving instructions working on 112 # independent blocks. In which case asymptotic limit for such modes 113 # can be obtained by dividing above mentioned numbers by AES 114 # instructions' interleave factor. Westmere can execute at most 3 115 # instructions at a time, meaning that optimal interleave factor is 3, 116 # and that's where the "magic" number of 1.25 come from. "Optimal 117 # interleave factor" means that increase of interleave factor does 118 # not improve performance. The formula has proven to reflect reality 119 # pretty well on Westmere... Sandy Bridge on the other hand can 120 # execute up to 8 AES instructions at a time, so how does varying 121 # interleave factor affect the performance? Here is table for ECB 122 # (numbers are cycles per byte processed with 128-bit key): 123 # 124 # instruction interleave factor 3x 6x 8x 125 # theoretical asymptotic limit 1.67 0.83 0.625 126 # measured performance for 8KB block 1.05 0.86 0.84 127 # 128 # "as if" interleave factor 4.7x 5.8x 6.0x 129 # 130 # Further data for other parallelizable modes: 131 # 132 # CBC decrypt 1.16 0.93 0.74 133 # CTR 1.14 0.91 0.74 134 # 135 # Well, given 3x column it's probably inappropriate to call the limit 136 # asymptotic, if it can be surpassed, isn't it? What happens there? 137 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution 138 # magic is responsible for this. 
# Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85
# Haswell	4.44/0.63	0.63	0.73	0.63
# Atom		5.75/3.54	3.56	4.12	3.87(*)
# Bulldozer	5.77/0.70	0.72	0.90	0.70
#
# (*)	Atom ECB result is suboptimal because of penalties incurred
#	by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.
# $PREFIX selects the symbol namespace of the generated code.
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows ABI is selected either explicitly via the flavour argument
# or implied by an .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  Quote interpreter,
# translator and output paths (they may contain spaces) and fail loudly
# if the pipe cannot be established instead of silently producing
# nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Historically this was a per-$PREFIX choice; both arms of the original
# ternary had collapsed to the same mnemonic, so use the unaligned-safe
# movups unconditionally.
$movkey = "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

# Register conventions shared by all routines below.
$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";	$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";	$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
# Emit a single-block AES pass.  $p is "enc" or "dec", $inout the block
# register (defaults to $inout0).  When $ivec is given, it is xored
# with the 0-round key and then into $inout before the rounds start
# (used by CCM to fold the last input block into the CMAC).  $key and
# $rounds are clobbered.  $sn makes each emitted loop label unique.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
# Folded round loop: one aes[enc|dec] per iteration, counted down by
# $rounds; the final round uses the aes*last form.
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block entry points, thin wrappers around
# aesni_generate1 with the platform's argument registers.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# All _aesni_[en|de]cryptN generators below share the same internal
# convention: $rounds is shifted left by 4 (rounds*16 = key-schedule
# byte length), $key is advanced past the end of the schedule, and
# %rax becomes a negative "twisted" counter so round keys are fetched
# with positive displacements ($key,%rax) while %rax counts up to 0.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# 3x interleave: historical sweet spot for Westmere-class cores.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# 6x interleave; note the software-pipelined prologue: the first round
# of the early blocks is started while later blocks are still being
# xored with the 0-round key.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	add	\$16,%rax
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
.L${dir}_loop6_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# 8x interleave, the widest variant; same pipelined-prologue idea as 6x.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	add	\$16,%rax
	pxor	$rndkey0,$inout5
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout6
	pxor	$rndkey0,$inout7
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jmp	.L${dir}_loop8_enter
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Instantiate the helpers.  The "enc" variants are only needed for the
# standalone aesni build; the AES drop-in replacement needs "dec" only.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");

if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
$code.=<<___;
.globl	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,\@function,5
.align	16
aesni_ecb_encrypt:
	and	\$-16,$len
	jz	.Lecb_ret

	mov	240($key),$rounds	# key->rounds
	$movkey	($key),$rndkey0
	mov	$key,$key_		# backup $key
	mov	$rounds,$rnds_		# backup $rounds
	test	%r8d,%r8d		# 5th argument
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	cmp	\$0x80,$len
	jb	.Lecb_enc_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_enc_loop8_enter
.align	16
.Lecb_enc_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu
	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_enc_loop8_enter:

	call	_aesni_encrypt8

	sub	\$0x80,$len
	jnc	.Lecb_enc_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

	# Tail: 1..7 remaining blocks, dispatched by comparing $len
	# against 0x20/0x40/0x60 (loads are interleaved with the
	# comparisons so each target has its inputs already in registers).
.Lecb_enc_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_enc_three
	movups	0x30($inp),$inout3
	je	.Lecb_enc_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_enc_five
	movups	0x50($inp),$inout5
	je	.Lecb_enc_six
	movdqu	0x60($inp),$inout6
	call	_aesni_encrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	call	_aesni_encrypt2
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_four:
	call	_aesni_encrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_five:
	xorps	$inout5,$inout5
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_six:
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	cmp	\$0x80,$len
	jb	.Lecb_dec_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_dec_loop8_enter
.align	16
.Lecb_dec_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_dec_loop8_enter:

	call	_aesni_decrypt8

	$movkey	($key_),$rndkey0
	sub	\$0x80,$len
	jnc	.Lecb_dec_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

	# Tail dispatch for decrypt, mirror image of .Lecb_enc_tail.
.Lecb_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_dec_three
	movups	0x30($inp),$inout3
	je	.Lecb_dec_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_dec_five
	movups	0x50($inp),$inout5
	je	.Lecb_dec_six
	movups	0x60($inp),$inout6
	$movkey	($key),$rndkey0
	call	_aesni_decrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	call	_aesni_decrypt2
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)

.Lecb_ret:
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{
my $cmac="%r9";			# 6th argument

# Local xmm assignments; note these overlap $inout4/$inout5 from the
# global layout, which is fine since only 2x interleave is used here.
my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
# Win64 ABI: %xmm6-%xmm9 are callee-saved, spill them.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
# CTR encryption of the input and the CBC-MAC over the input are
# interleaved 2x, which is where the ~30% gain over a disjoint
# implementation comes from (see header comment).
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out
	jnz	.Lccm64_enc_outer

	movups	$inout1,($cmac)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
# Decrypt: the first counter block is encrypted stand-alone (the
# CBC-MAC input, i.e. the plaintext, is not available until after the
# CTR decryption), then subsequent iterations interleave CTR with the
# MAC of the previous plaintext block.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len
	jz	.Lccm64_dec_break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	# Last block: fold the final plaintext into the CMAC via the
	# $ivec parameter of aesni_generate1 (replaces the commented-out
	# explicit xor below).
	#xorps	$in0,$inout1		# cmac^=out
	mov	240($key_),$rounds
___
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	movups	$inout1,($cmac)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
1081 { 1082 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1083 my ($key0,$ctr)=("${key_}d","${ivp}d"); 1084 my $frame_size = 0x80 + ($win64?160:0); 1085 1086 $code.=<<___; 1087 .globl aesni_ctr32_encrypt_blocks 1088 .type aesni_ctr32_encrypt_blocks,\@function,5 1089 .align 16 1090 aesni_ctr32_encrypt_blocks: 1091 lea (%rsp),%rax 1092 push %rbp 1093 sub \$$frame_size,%rsp 1094 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1095 ___ 1096 $code.=<<___ if ($win64); 1097 movaps %xmm6,-0xa8(%rax) 1098 movaps %xmm7,-0x98(%rax) 1099 movaps %xmm8,-0x88(%rax) 1100 movaps %xmm9,-0x78(%rax) 1101 movaps %xmm10,-0x68(%rax) 1102 movaps %xmm11,-0x58(%rax) 1103 movaps %xmm12,-0x48(%rax) 1104 movaps %xmm13,-0x38(%rax) 1105 movaps %xmm14,-0x28(%rax) 1106 movaps %xmm15,-0x18(%rax) 1107 .Lctr32_body: 1108 ___ 1109 $code.=<<___; 1110 lea -8(%rax),%rbp 1111 1112 cmp \$1,$len 1113 je .Lctr32_one_shortcut 1114 1115 movdqu ($ivp),$inout0 1116 movdqu ($key),$rndkey0 1117 mov 12($ivp),$ctr # counter LSB 1118 pxor $rndkey0,$inout0 1119 mov 12($key),$key0 # 0-round key LSB 1120 movdqa $inout0,0x00(%rsp) # populate counter block 1121 bswap $ctr 1122 movdqa $inout0,$inout1 1123 movdqa $inout0,$inout2 1124 movdqa $inout0,$inout3 1125 movdqa $inout0,0x40(%rsp) 1126 movdqa $inout0,0x50(%rsp) 1127 movdqa $inout0,0x60(%rsp) 1128 mov %rdx,%r10 # borrow %rdx 1129 movdqa $inout0,0x70(%rsp) 1130 1131 lea 1($ctr),%rax 1132 lea 2($ctr),%rdx 1133 bswap %eax 1134 bswap %edx 1135 xor $key0,%eax 1136 xor $key0,%edx 1137 pinsrd \$3,%eax,$inout1 1138 lea 3($ctr),%rax 1139 movdqa $inout1,0x10(%rsp) 1140 pinsrd \$3,%edx,$inout2 1141 bswap %eax 1142 mov %r10,%rdx # restore %rdx 1143 lea 4($ctr),%r10 1144 movdqa $inout2,0x20(%rsp) 1145 xor $key0,%eax 1146 bswap %r10d 1147 pinsrd \$3,%eax,$inout3 1148 xor $key0,%r10d 1149 movdqa $inout3,0x30(%rsp) 1150 lea 5($ctr),%r9 1151 mov %r10d,0x40+12(%rsp) 1152 bswap %r9d 1153 lea 6($ctr),%r10 1154 mov 240($key),$rounds # key->rounds 1155 xor 
$key0,%r9d 1156 bswap %r10d 1157 mov %r9d,0x50+12(%rsp) 1158 xor $key0,%r10d 1159 lea 7($ctr),%r9 1160 mov %r10d,0x60+12(%rsp) 1161 bswap %r9d 1162 mov OPENSSL_ia32cap_P+4(%rip),%r10d 1163 xor $key0,%r9d 1164 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1165 mov %r9d,0x70+12(%rsp) 1166 1167 $movkey 0x10($key),$rndkey1 1168 1169 movdqa 0x40(%rsp),$inout4 1170 movdqa 0x50(%rsp),$inout5 1171 1172 cmp \$8,$len 1173 jb .Lctr32_tail 1174 1175 sub \$6,$len 1176 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1177 je .Lctr32_6x 1178 1179 lea 0x80($key),$key # size optimization 1180 sub \$2,$len 1181 jmp .Lctr32_loop8 1182 1183 .align 16 1184 .Lctr32_6x: 1185 shl \$4,$rounds 1186 mov \$48,$rnds_ 1187 bswap $key0 1188 lea 32($key,$rounds),$key # end of key schedule 1189 sub %rax,%r10 # twisted $rounds 1190 jmp .Lctr32_loop6 1191 1192 .align 16 1193 .Lctr32_loop6: 1194 add \$6,$ctr 1195 $movkey -48($key,$rnds_),$rndkey0 1196 aesenc $rndkey1,$inout0 1197 mov $ctr,%eax 1198 xor $key0,%eax 1199 aesenc $rndkey1,$inout1 1200 movbe %eax,`0x00+12`(%rsp) 1201 lea 1($ctr),%eax 1202 aesenc $rndkey1,$inout2 1203 xor $key0,%eax 1204 movbe %eax,`0x10+12`(%rsp) 1205 aesenc $rndkey1,$inout3 1206 lea 2($ctr),%eax 1207 xor $key0,%eax 1208 aesenc $rndkey1,$inout4 1209 movbe %eax,`0x20+12`(%rsp) 1210 lea 3($ctr),%eax 1211 aesenc $rndkey1,$inout5 1212 $movkey -32($key,$rnds_),$rndkey1 1213 xor $key0,%eax 1214 1215 aesenc $rndkey0,$inout0 1216 movbe %eax,`0x30+12`(%rsp) 1217 lea 4($ctr),%eax 1218 aesenc $rndkey0,$inout1 1219 xor $key0,%eax 1220 movbe %eax,`0x40+12`(%rsp) 1221 aesenc $rndkey0,$inout2 1222 lea 5($ctr),%eax 1223 xor $key0,%eax 1224 aesenc $rndkey0,$inout3 1225 movbe %eax,`0x50+12`(%rsp) 1226 mov %r10,%rax # mov $rnds_,$rounds 1227 aesenc $rndkey0,$inout4 1228 aesenc $rndkey0,$inout5 1229 $movkey -16($key,$rnds_),$rndkey0 1230 1231 call .Lenc_loop6 1232 1233 movdqu ($inp),$inout6 1234 movdqu 0x10($inp),$inout7 1235 movdqu 0x20($inp),$in0 1236 movdqu 0x30($inp),$in1 1237 movdqu 
0x40($inp),$in2 1238 movdqu 0x50($inp),$in3 1239 lea 0x60($inp),$inp 1240 $movkey -64($key,$rnds_),$rndkey1 1241 pxor $inout0,$inout6 1242 movaps 0x00(%rsp),$inout0 1243 pxor $inout1,$inout7 1244 movaps 0x10(%rsp),$inout1 1245 pxor $inout2,$in0 1246 movaps 0x20(%rsp),$inout2 1247 pxor $inout3,$in1 1248 movaps 0x30(%rsp),$inout3 1249 pxor $inout4,$in2 1250 movaps 0x40(%rsp),$inout4 1251 pxor $inout5,$in3 1252 movaps 0x50(%rsp),$inout5 1253 movdqu $inout6,($out) 1254 movdqu $inout7,0x10($out) 1255 movdqu $in0,0x20($out) 1256 movdqu $in1,0x30($out) 1257 movdqu $in2,0x40($out) 1258 movdqu $in3,0x50($out) 1259 lea 0x60($out),$out 1260 1261 sub \$6,$len 1262 jnc .Lctr32_loop6 1263 1264 add \$6,$len 1265 jz .Lctr32_done 1266 1267 lea -48($rnds_),$rounds 1268 lea -80($key,$rnds_),$key # restore $key 1269 neg $rounds 1270 shr \$4,$rounds # restore $rounds 1271 jmp .Lctr32_tail 1272 1273 .align 32 1274 .Lctr32_loop8: 1275 add \$8,$ctr 1276 movdqa 0x60(%rsp),$inout6 1277 aesenc $rndkey1,$inout0 1278 mov $ctr,%r9d 1279 movdqa 0x70(%rsp),$inout7 1280 aesenc $rndkey1,$inout1 1281 bswap %r9d 1282 $movkey 0x20-0x80($key),$rndkey0 1283 aesenc $rndkey1,$inout2 1284 xor $key0,%r9d 1285 nop 1286 aesenc $rndkey1,$inout3 1287 mov %r9d,0x00+12(%rsp) 1288 lea 1($ctr),%r9 1289 aesenc $rndkey1,$inout4 1290 aesenc $rndkey1,$inout5 1291 aesenc $rndkey1,$inout6 1292 aesenc $rndkey1,$inout7 1293 $movkey 0x30-0x80($key),$rndkey1 1294 ___ 1295 for($i=2;$i<8;$i++) { 1296 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1297 $code.=<<___; 1298 bswap %r9d 1299 aesenc $rndkeyx,$inout0 1300 aesenc $rndkeyx,$inout1 1301 xor $key0,%r9d 1302 .byte 0x66,0x90 1303 aesenc $rndkeyx,$inout2 1304 aesenc $rndkeyx,$inout3 1305 mov %r9d,`0x10*($i-1)`+12(%rsp) 1306 lea $i($ctr),%r9 1307 aesenc $rndkeyx,$inout4 1308 aesenc $rndkeyx,$inout5 1309 aesenc $rndkeyx,$inout6 1310 aesenc $rndkeyx,$inout7 1311 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1312 ___ 1313 } 1314 $code.=<<___; 1315 bswap %r9d 1316 aesenc $rndkey0,$inout0 
1317 aesenc $rndkey0,$inout1 1318 aesenc $rndkey0,$inout2 1319 xor $key0,%r9d 1320 movdqu 0x00($inp),$in0 1321 aesenc $rndkey0,$inout3 1322 mov %r9d,0x70+12(%rsp) 1323 cmp \$11,$rounds 1324 aesenc $rndkey0,$inout4 1325 aesenc $rndkey0,$inout5 1326 aesenc $rndkey0,$inout6 1327 aesenc $rndkey0,$inout7 1328 $movkey 0xa0-0x80($key),$rndkey0 1329 1330 jb .Lctr32_enc_done 1331 1332 aesenc $rndkey1,$inout0 1333 aesenc $rndkey1,$inout1 1334 aesenc $rndkey1,$inout2 1335 aesenc $rndkey1,$inout3 1336 aesenc $rndkey1,$inout4 1337 aesenc $rndkey1,$inout5 1338 aesenc $rndkey1,$inout6 1339 aesenc $rndkey1,$inout7 1340 $movkey 0xb0-0x80($key),$rndkey1 1341 1342 aesenc $rndkey0,$inout0 1343 aesenc $rndkey0,$inout1 1344 aesenc $rndkey0,$inout2 1345 aesenc $rndkey0,$inout3 1346 aesenc $rndkey0,$inout4 1347 aesenc $rndkey0,$inout5 1348 aesenc $rndkey0,$inout6 1349 aesenc $rndkey0,$inout7 1350 $movkey 0xc0-0x80($key),$rndkey0 1351 je .Lctr32_enc_done 1352 1353 aesenc $rndkey1,$inout0 1354 aesenc $rndkey1,$inout1 1355 aesenc $rndkey1,$inout2 1356 aesenc $rndkey1,$inout3 1357 aesenc $rndkey1,$inout4 1358 aesenc $rndkey1,$inout5 1359 aesenc $rndkey1,$inout6 1360 aesenc $rndkey1,$inout7 1361 $movkey 0xd0-0x80($key),$rndkey1 1362 1363 aesenc $rndkey0,$inout0 1364 aesenc $rndkey0,$inout1 1365 aesenc $rndkey0,$inout2 1366 aesenc $rndkey0,$inout3 1367 aesenc $rndkey0,$inout4 1368 aesenc $rndkey0,$inout5 1369 aesenc $rndkey0,$inout6 1370 aesenc $rndkey0,$inout7 1371 $movkey 0xe0-0x80($key),$rndkey0 1372 jmp .Lctr32_enc_done 1373 1374 .align 16 1375 .Lctr32_enc_done: 1376 movdqu 0x10($inp),$in1 1377 pxor $rndkey0,$in0 1378 movdqu 0x20($inp),$in2 1379 pxor $rndkey0,$in1 1380 movdqu 0x30($inp),$in3 1381 pxor $rndkey0,$in2 1382 movdqu 0x40($inp),$in4 1383 pxor $rndkey0,$in3 1384 movdqu 0x50($inp),$in5 1385 pxor $rndkey0,$in4 1386 pxor $rndkey0,$in5 1387 aesenc $rndkey1,$inout0 1388 aesenc $rndkey1,$inout1 1389 aesenc $rndkey1,$inout2 1390 aesenc $rndkey1,$inout3 1391 aesenc $rndkey1,$inout4 1392 
aesenc $rndkey1,$inout5 1393 aesenc $rndkey1,$inout6 1394 aesenc $rndkey1,$inout7 1395 movdqu 0x60($inp),$rndkey1 1396 lea 0x80($inp),$inp 1397 1398 aesenclast $in0,$inout0 1399 pxor $rndkey0,$rndkey1 1400 movdqu 0x70-0x80($inp),$in0 1401 aesenclast $in1,$inout1 1402 pxor $rndkey0,$in0 1403 movdqa 0x00(%rsp),$in1 # load next counter block 1404 aesenclast $in2,$inout2 1405 aesenclast $in3,$inout3 1406 movdqa 0x10(%rsp),$in2 1407 movdqa 0x20(%rsp),$in3 1408 aesenclast $in4,$inout4 1409 aesenclast $in5,$inout5 1410 movdqa 0x30(%rsp),$in4 1411 movdqa 0x40(%rsp),$in5 1412 aesenclast $rndkey1,$inout6 1413 movdqa 0x50(%rsp),$rndkey0 1414 $movkey 0x10-0x80($key),$rndkey1 1415 aesenclast $in0,$inout7 1416 1417 movups $inout0,($out) # store output 1418 movdqa $in1,$inout0 1419 movups $inout1,0x10($out) 1420 movdqa $in2,$inout1 1421 movups $inout2,0x20($out) 1422 movdqa $in3,$inout2 1423 movups $inout3,0x30($out) 1424 movdqa $in4,$inout3 1425 movups $inout4,0x40($out) 1426 movdqa $in5,$inout4 1427 movups $inout5,0x50($out) 1428 movdqa $rndkey0,$inout5 1429 movups $inout6,0x60($out) 1430 movups $inout7,0x70($out) 1431 lea 0x80($out),$out 1432 1433 sub \$8,$len 1434 jnc .Lctr32_loop8 1435 1436 add \$8,$len 1437 jz .Lctr32_done 1438 lea -0x80($key),$key 1439 1440 .Lctr32_tail: 1441 lea 16($key),$key 1442 cmp \$4,$len 1443 jb .Lctr32_loop3 1444 je .Lctr32_loop4 1445 1446 shl \$4,$rounds 1447 movdqa 0x60(%rsp),$inout6 1448 pxor $inout7,$inout7 1449 1450 $movkey 16($key),$rndkey0 1451 aesenc $rndkey1,$inout0 1452 aesenc $rndkey1,$inout1 1453 lea 32-16($key,$rounds),$key 1454 neg %rax 1455 aesenc $rndkey1,$inout2 1456 add \$16,%rax 1457 movups ($inp),$in0 1458 aesenc $rndkey1,$inout3 1459 aesenc $rndkey1,$inout4 1460 movups 0x10($inp),$in1 1461 movups 0x20($inp),$in2 1462 aesenc $rndkey1,$inout5 1463 aesenc $rndkey1,$inout6 1464 1465 call .Lenc_loop8_enter 1466 1467 movdqu 0x30($inp),$in3 1468 pxor $in0,$inout0 1469 movdqu 0x40($inp),$in0 1470 pxor $in1,$inout1 1471 movdqu 
$inout0,($out) 1472 pxor $in2,$inout2 1473 movdqu $inout1,0x10($out) 1474 pxor $in3,$inout3 1475 movdqu $inout2,0x20($out) 1476 pxor $in0,$inout4 1477 movdqu $inout3,0x30($out) 1478 movdqu $inout4,0x40($out) 1479 cmp \$6,$len 1480 jb .Lctr32_done 1481 1482 movups 0x50($inp),$in1 1483 xorps $in1,$inout5 1484 movups $inout5,0x50($out) 1485 je .Lctr32_done 1486 1487 movups 0x60($inp),$in2 1488 xorps $in2,$inout6 1489 movups $inout6,0x60($out) 1490 jmp .Lctr32_done 1491 1492 .align 32 1493 .Lctr32_loop4: 1494 aesenc $rndkey1,$inout0 1495 lea 16($key),$key 1496 dec $rounds 1497 aesenc $rndkey1,$inout1 1498 aesenc $rndkey1,$inout2 1499 aesenc $rndkey1,$inout3 1500 $movkey ($key),$rndkey1 1501 jnz .Lctr32_loop4 1502 aesenclast $rndkey1,$inout0 1503 aesenclast $rndkey1,$inout1 1504 movups ($inp),$in0 1505 movups 0x10($inp),$in1 1506 aesenclast $rndkey1,$inout2 1507 aesenclast $rndkey1,$inout3 1508 movups 0x20($inp),$in2 1509 movups 0x30($inp),$in3 1510 1511 xorps $in0,$inout0 1512 movups $inout0,($out) 1513 xorps $in1,$inout1 1514 movups $inout1,0x10($out) 1515 pxor $in2,$inout2 1516 movdqu $inout2,0x20($out) 1517 pxor $in3,$inout3 1518 movdqu $inout3,0x30($out) 1519 jmp .Lctr32_done 1520 1521 .align 32 1522 .Lctr32_loop3: 1523 aesenc $rndkey1,$inout0 1524 lea 16($key),$key 1525 dec $rounds 1526 aesenc $rndkey1,$inout1 1527 aesenc $rndkey1,$inout2 1528 $movkey ($key),$rndkey1 1529 jnz .Lctr32_loop3 1530 aesenclast $rndkey1,$inout0 1531 aesenclast $rndkey1,$inout1 1532 aesenclast $rndkey1,$inout2 1533 1534 movups ($inp),$in0 1535 xorps $in0,$inout0 1536 movups $inout0,($out) 1537 cmp \$2,$len 1538 jb .Lctr32_done 1539 1540 movups 0x10($inp),$in1 1541 xorps $in1,$inout1 1542 movups $inout1,0x10($out) 1543 je .Lctr32_done 1544 1545 movups 0x20($inp),$in2 1546 xorps $in2,$inout2 1547 movups $inout2,0x20($out) 1548 jmp .Lctr32_done 1549 1550 .align 16 1551 .Lctr32_one_shortcut: 1552 movups ($ivp),$inout0 1553 movups ($inp),$in0 1554 mov 240($key),$rounds # key->rounds 1555 ___ 
1556 &aesni_generate1("enc",$key,$rounds); 1557 $code.=<<___; 1558 xorps $in0,$inout0 1559 movups $inout0,($out) 1560 jmp .Lctr32_done 1561 1562 .align 16 1563 .Lctr32_done: 1564 ___ 1565 $code.=<<___ if ($win64); 1566 movaps -0xa0(%rbp),%xmm6 1567 movaps -0x90(%rbp),%xmm7 1568 movaps -0x80(%rbp),%xmm8 1569 movaps -0x70(%rbp),%xmm9 1570 movaps -0x60(%rbp),%xmm10 1571 movaps -0x50(%rbp),%xmm11 1572 movaps -0x40(%rbp),%xmm12 1573 movaps -0x30(%rbp),%xmm13 1574 movaps -0x20(%rbp),%xmm14 1575 movaps -0x10(%rbp),%xmm15 1576 ___ 1577 $code.=<<___; 1578 lea (%rbp),%rsp 1579 pop %rbp 1580 .Lctr32_epilogue: 1581 ret 1582 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 1583 ___ 1584 } 1585 1587 ###################################################################### 1588 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1589 # const AES_KEY *key1, const AES_KEY *key2 1590 # const unsigned char iv[16]); 1591 # 1592 { 1593 my @tweak=map("%xmm$_",(10..15)); 1594 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1595 my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1596 my $frame_size = 0x70 + ($win64?160:0); 1597 1598 $code.=<<___; 1599 .globl aesni_xts_encrypt 1600 .type aesni_xts_encrypt,\@function,6 1601 .align 16 1602 aesni_xts_encrypt: 1603 lea (%rsp),%rax 1604 push %rbp 1605 sub \$$frame_size,%rsp 1606 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1607 ___ 1608 $code.=<<___ if ($win64); 1609 movaps %xmm6,-0xa8(%rax) 1610 movaps %xmm7,-0x98(%rax) 1611 movaps %xmm8,-0x88(%rax) 1612 movaps %xmm9,-0x78(%rax) 1613 movaps %xmm10,-0x68(%rax) 1614 movaps %xmm11,-0x58(%rax) 1615 movaps %xmm12,-0x48(%rax) 1616 movaps %xmm13,-0x38(%rax) 1617 movaps %xmm14,-0x28(%rax) 1618 movaps %xmm15,-0x18(%rax) 1619 .Lxts_enc_body: 1620 ___ 1621 $code.=<<___; 1622 lea -8(%rax),%rbp 1623 movups ($ivp),$inout0 # load clear-text tweak 1624 mov 240(%r8),$rounds # key2->rounds 1625 mov 240($key),$rnds_ # key1->rounds 1626 ___ 1627 # generate the tweak 1628 
&aesni_generate1("enc",$key2,$rounds,$inout0); 1629 $code.=<<___; 1630 $movkey ($key),$rndkey0 # zero round key 1631 mov $key,$key_ # backup $key 1632 mov $rnds_,$rounds # backup $rounds 1633 shl \$4,$rnds_ 1634 mov $len,$len_ # backup $len 1635 and \$-16,$len 1636 1637 $movkey 16($key,$rnds_),$rndkey1 # last round key 1638 1639 movdqa .Lxts_magic(%rip),$twmask 1640 movdqa $inout0,@tweak[5] 1641 pshufd \$0x5f,$inout0,$twres 1642 pxor $rndkey0,$rndkey1 1643 ___ 1644 # alternative tweak calculation algorithm is based on suggestions 1645 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1646 # and should help in the future... 1647 for ($i=0;$i<4;$i++) { 1648 $code.=<<___; 1649 movdqa $twres,$twtmp 1650 paddd $twres,$twres 1651 movdqa @tweak[5],@tweak[$i] 1652 psrad \$31,$twtmp # broadcast upper bits 1653 paddq @tweak[5],@tweak[5] 1654 pand $twmask,$twtmp 1655 pxor $rndkey0,@tweak[$i] 1656 pxor $twtmp,@tweak[5] 1657 ___ 1658 } 1659 $code.=<<___; 1660 movdqa @tweak[5],@tweak[4] 1661 psrad \$31,$twres 1662 paddq @tweak[5],@tweak[5] 1663 pand $twmask,$twres 1664 pxor $rndkey0,@tweak[4] 1665 pxor $twres,@tweak[5] 1666 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1667 1668 sub \$16*6,$len 1669 jc .Lxts_enc_short 1670 1671 mov \$16+96,$rounds 1672 lea 32($key_,$rnds_),$key # end of key schedule 1673 sub %r10,%rax # twisted $rounds 1674 $movkey 16($key_),$rndkey1 1675 mov %rax,%r10 # backup twisted $rounds 1676 lea .Lxts_magic(%rip),%r8 1677 jmp .Lxts_enc_grandloop 1678 1679 .align 32 1680 .Lxts_enc_grandloop: 1681 movdqu `16*0`($inp),$inout0 # load input 1682 movdqa $rndkey0,$twmask 1683 movdqu `16*1`($inp),$inout1 1684 pxor @tweak[0],$inout0 1685 movdqu `16*2`($inp),$inout2 1686 pxor @tweak[1],$inout1 1687 aesenc $rndkey1,$inout0 1688 movdqu `16*3`($inp),$inout3 1689 pxor @tweak[2],$inout2 1690 aesenc $rndkey1,$inout1 1691 movdqu `16*4`($inp),$inout4 1692 pxor @tweak[3],$inout3 1693 aesenc $rndkey1,$inout2 1694 movdqu `16*5`($inp),$inout5 1695 pxor 
@tweak[5],$twmask # round[0]^=tweak[5] 1696 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1697 pxor @tweak[4],$inout4 1698 aesenc $rndkey1,$inout3 1699 $movkey 32($key_),$rndkey0 1700 lea `16*6`($inp),$inp 1701 pxor $twmask,$inout5 1702 1703 pxor $twres,@tweak[0] 1704 aesenc $rndkey1,$inout4 1705 pxor $twres,@tweak[1] 1706 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 1707 aesenc $rndkey1,$inout5 1708 $movkey 48($key_),$rndkey1 1709 pxor $twres,@tweak[2] 1710 1711 aesenc $rndkey0,$inout0 1712 pxor $twres,@tweak[3] 1713 movdqa @tweak[1],`16*1`(%rsp) 1714 aesenc $rndkey0,$inout1 1715 pxor $twres,@tweak[4] 1716 movdqa @tweak[2],`16*2`(%rsp) 1717 aesenc $rndkey0,$inout2 1718 aesenc $rndkey0,$inout3 1719 pxor $twres,$twmask 1720 movdqa @tweak[4],`16*4`(%rsp) 1721 aesenc $rndkey0,$inout4 1722 aesenc $rndkey0,$inout5 1723 $movkey 64($key_),$rndkey0 1724 movdqa $twmask,`16*5`(%rsp) 1725 pshufd \$0x5f,@tweak[5],$twres 1726 jmp .Lxts_enc_loop6 1727 .align 32 1728 .Lxts_enc_loop6: 1729 aesenc $rndkey1,$inout0 1730 aesenc $rndkey1,$inout1 1731 aesenc $rndkey1,$inout2 1732 aesenc $rndkey1,$inout3 1733 aesenc $rndkey1,$inout4 1734 aesenc $rndkey1,$inout5 1735 $movkey -64($key,%rax),$rndkey1 1736 add \$32,%rax 1737 1738 aesenc $rndkey0,$inout0 1739 aesenc $rndkey0,$inout1 1740 aesenc $rndkey0,$inout2 1741 aesenc $rndkey0,$inout3 1742 aesenc $rndkey0,$inout4 1743 aesenc $rndkey0,$inout5 1744 $movkey -80($key,%rax),$rndkey0 1745 jnz .Lxts_enc_loop6 1746 1747 movdqa (%r8),$twmask 1748 movdqa $twres,$twtmp 1749 paddd $twres,$twres 1750 aesenc $rndkey1,$inout0 1751 paddq @tweak[5],@tweak[5] 1752 psrad \$31,$twtmp 1753 aesenc $rndkey1,$inout1 1754 pand $twmask,$twtmp 1755 $movkey ($key_),@tweak[0] # load round[0] 1756 aesenc $rndkey1,$inout2 1757 aesenc $rndkey1,$inout3 1758 aesenc $rndkey1,$inout4 1759 pxor $twtmp,@tweak[5] 1760 movaps @tweak[0],@tweak[1] # copy round[0] 1761 aesenc $rndkey1,$inout5 1762 $movkey -64($key),$rndkey1 1763 1764 movdqa 
$twres,$twtmp 1765 aesenc $rndkey0,$inout0 1766 paddd $twres,$twres 1767 pxor @tweak[5],@tweak[0] 1768 aesenc $rndkey0,$inout1 1769 psrad \$31,$twtmp 1770 paddq @tweak[5],@tweak[5] 1771 aesenc $rndkey0,$inout2 1772 aesenc $rndkey0,$inout3 1773 pand $twmask,$twtmp 1774 movaps @tweak[1],@tweak[2] 1775 aesenc $rndkey0,$inout4 1776 pxor $twtmp,@tweak[5] 1777 movdqa $twres,$twtmp 1778 aesenc $rndkey0,$inout5 1779 $movkey -48($key),$rndkey0 1780 1781 paddd $twres,$twres 1782 aesenc $rndkey1,$inout0 1783 pxor @tweak[5],@tweak[1] 1784 psrad \$31,$twtmp 1785 aesenc $rndkey1,$inout1 1786 paddq @tweak[5],@tweak[5] 1787 pand $twmask,$twtmp 1788 aesenc $rndkey1,$inout2 1789 aesenc $rndkey1,$inout3 1790 movdqa @tweak[3],`16*3`(%rsp) 1791 pxor $twtmp,@tweak[5] 1792 aesenc $rndkey1,$inout4 1793 movaps @tweak[2],@tweak[3] 1794 movdqa $twres,$twtmp 1795 aesenc $rndkey1,$inout5 1796 $movkey -32($key),$rndkey1 1797 1798 paddd $twres,$twres 1799 aesenc $rndkey0,$inout0 1800 pxor @tweak[5],@tweak[2] 1801 psrad \$31,$twtmp 1802 aesenc $rndkey0,$inout1 1803 paddq @tweak[5],@tweak[5] 1804 pand $twmask,$twtmp 1805 aesenc $rndkey0,$inout2 1806 aesenc $rndkey0,$inout3 1807 aesenc $rndkey0,$inout4 1808 pxor $twtmp,@tweak[5] 1809 movaps @tweak[3],@tweak[4] 1810 aesenc $rndkey0,$inout5 1811 1812 movdqa $twres,$rndkey0 1813 paddd $twres,$twres 1814 aesenc $rndkey1,$inout0 1815 pxor @tweak[5],@tweak[3] 1816 psrad \$31,$rndkey0 1817 aesenc $rndkey1,$inout1 1818 paddq @tweak[5],@tweak[5] 1819 pand $twmask,$rndkey0 1820 aesenc $rndkey1,$inout2 1821 aesenc $rndkey1,$inout3 1822 pxor $rndkey0,@tweak[5] 1823 $movkey ($key_),$rndkey0 1824 aesenc $rndkey1,$inout4 1825 aesenc $rndkey1,$inout5 1826 $movkey 16($key_),$rndkey1 1827 1828 pxor @tweak[5],@tweak[4] 1829 aesenclast `16*0`(%rsp),$inout0 1830 psrad \$31,$twres 1831 paddq @tweak[5],@tweak[5] 1832 aesenclast `16*1`(%rsp),$inout1 1833 aesenclast `16*2`(%rsp),$inout2 1834 pand $twmask,$twres 1835 mov %r10,%rax # restore $rounds 1836 aesenclast 
`16*3`(%rsp),$inout3 1837 aesenclast `16*4`(%rsp),$inout4 1838 aesenclast `16*5`(%rsp),$inout5 1839 pxor $twres,@tweak[5] 1840 1841 lea `16*6`($out),$out 1842 movups $inout0,`-16*6`($out) # write output 1843 movups $inout1,`-16*5`($out) 1844 movups $inout2,`-16*4`($out) 1845 movups $inout3,`-16*3`($out) 1846 movups $inout4,`-16*2`($out) 1847 movups $inout5,`-16*1`($out) 1848 sub \$16*6,$len 1849 jnc .Lxts_enc_grandloop 1850 1851 mov \$16+96,$rounds 1852 sub $rnds_,$rounds 1853 mov $key_,$key # restore $key 1854 shr \$4,$rounds # restore original value 1855 1856 .Lxts_enc_short: 1857 mov $rounds,$rnds_ # backup $rounds 1858 pxor $rndkey0,@tweak[0] 1859 add \$16*6,$len 1860 jz .Lxts_enc_done 1861 1862 pxor $rndkey0,@tweak[1] 1863 cmp \$0x20,$len 1864 jb .Lxts_enc_one 1865 pxor $rndkey0,@tweak[2] 1866 je .Lxts_enc_two 1867 1868 pxor $rndkey0,@tweak[3] 1869 cmp \$0x40,$len 1870 jb .Lxts_enc_three 1871 pxor $rndkey0,@tweak[4] 1872 je .Lxts_enc_four 1873 1874 movdqu ($inp),$inout0 1875 movdqu 16*1($inp),$inout1 1876 movdqu 16*2($inp),$inout2 1877 pxor @tweak[0],$inout0 1878 movdqu 16*3($inp),$inout3 1879 pxor @tweak[1],$inout1 1880 movdqu 16*4($inp),$inout4 1881 lea 16*5($inp),$inp 1882 pxor @tweak[2],$inout2 1883 pxor @tweak[3],$inout3 1884 pxor @tweak[4],$inout4 1885 1886 call _aesni_encrypt6 1887 1888 xorps @tweak[0],$inout0 1889 movdqa @tweak[5],@tweak[0] 1890 xorps @tweak[1],$inout1 1891 xorps @tweak[2],$inout2 1892 movdqu $inout0,($out) 1893 xorps @tweak[3],$inout3 1894 movdqu $inout1,16*1($out) 1895 xorps @tweak[4],$inout4 1896 movdqu $inout2,16*2($out) 1897 movdqu $inout3,16*3($out) 1898 movdqu $inout4,16*4($out) 1899 lea 16*5($out),$out 1900 jmp .Lxts_enc_done 1901 1902 .align 16 1903 .Lxts_enc_one: 1904 movups ($inp),$inout0 1905 lea 16*1($inp),$inp 1906 xorps @tweak[0],$inout0 1907 ___ 1908 &aesni_generate1("enc",$key,$rounds); 1909 $code.=<<___; 1910 xorps @tweak[0],$inout0 1911 movdqa @tweak[1],@tweak[0] 1912 movups $inout0,($out) 1913 lea 16*1($out),$out 
1914 jmp .Lxts_enc_done 1915 1916 .align 16 1917 .Lxts_enc_two: 1918 movups ($inp),$inout0 1919 movups 16($inp),$inout1 1920 lea 32($inp),$inp 1921 xorps @tweak[0],$inout0 1922 xorps @tweak[1],$inout1 1923 1924 call _aesni_encrypt2 1925 1926 xorps @tweak[0],$inout0 1927 movdqa @tweak[2],@tweak[0] 1928 xorps @tweak[1],$inout1 1929 movups $inout0,($out) 1930 movups $inout1,16*1($out) 1931 lea 16*2($out),$out 1932 jmp .Lxts_enc_done 1933 1934 .align 16 1935 .Lxts_enc_three: 1936 movups ($inp),$inout0 1937 movups 16*1($inp),$inout1 1938 movups 16*2($inp),$inout2 1939 lea 16*3($inp),$inp 1940 xorps @tweak[0],$inout0 1941 xorps @tweak[1],$inout1 1942 xorps @tweak[2],$inout2 1943 1944 call _aesni_encrypt3 1945 1946 xorps @tweak[0],$inout0 1947 movdqa @tweak[3],@tweak[0] 1948 xorps @tweak[1],$inout1 1949 xorps @tweak[2],$inout2 1950 movups $inout0,($out) 1951 movups $inout1,16*1($out) 1952 movups $inout2,16*2($out) 1953 lea 16*3($out),$out 1954 jmp .Lxts_enc_done 1955 1956 .align 16 1957 .Lxts_enc_four: 1958 movups ($inp),$inout0 1959 movups 16*1($inp),$inout1 1960 movups 16*2($inp),$inout2 1961 xorps @tweak[0],$inout0 1962 movups 16*3($inp),$inout3 1963 lea 16*4($inp),$inp 1964 xorps @tweak[1],$inout1 1965 xorps @tweak[2],$inout2 1966 xorps @tweak[3],$inout3 1967 1968 call _aesni_encrypt4 1969 1970 pxor @tweak[0],$inout0 1971 movdqa @tweak[4],@tweak[0] 1972 pxor @tweak[1],$inout1 1973 pxor @tweak[2],$inout2 1974 movdqu $inout0,($out) 1975 pxor @tweak[3],$inout3 1976 movdqu $inout1,16*1($out) 1977 movdqu $inout2,16*2($out) 1978 movdqu $inout3,16*3($out) 1979 lea 16*4($out),$out 1980 jmp .Lxts_enc_done 1981 1982 .align 16 1983 .Lxts_enc_done: 1984 and \$15,$len_ 1985 jz .Lxts_enc_ret 1986 mov $len_,$len 1987 1988 .Lxts_enc_steal: 1989 movzb ($inp),%eax # borrow $rounds ... 1990 movzb -16($out),%ecx # ... 
and $key 1991 lea 1($inp),$inp 1992 mov %al,-16($out) 1993 mov %cl,0($out) 1994 lea 1($out),$out 1995 sub \$1,$len 1996 jnz .Lxts_enc_steal 1997 1998 sub $len_,$out # rewind $out 1999 mov $key_,$key # restore $key 2000 mov $rnds_,$rounds # restore $rounds 2001 2002 movups -16($out),$inout0 2003 xorps @tweak[0],$inout0 2004 ___ 2005 &aesni_generate1("enc",$key,$rounds); 2006 $code.=<<___; 2007 xorps @tweak[0],$inout0 2008 movups $inout0,-16($out) 2009 2010 .Lxts_enc_ret: 2011 ___ 2012 $code.=<<___ if ($win64); 2013 movaps -0xa0(%rbp),%xmm6 2014 movaps -0x90(%rbp),%xmm7 2015 movaps -0x80(%rbp),%xmm8 2016 movaps -0x70(%rbp),%xmm9 2017 movaps -0x60(%rbp),%xmm10 2018 movaps -0x50(%rbp),%xmm11 2019 movaps -0x40(%rbp),%xmm12 2020 movaps -0x30(%rbp),%xmm13 2021 movaps -0x20(%rbp),%xmm14 2022 movaps -0x10(%rbp),%xmm15 2023 ___ 2024 $code.=<<___; 2025 lea (%rbp),%rsp 2026 pop %rbp 2027 .Lxts_enc_epilogue: 2028 ret 2029 .size aesni_xts_encrypt,.-aesni_xts_encrypt 2030 ___ 2031 2032 $code.=<<___; 2033 .globl aesni_xts_decrypt 2034 .type aesni_xts_decrypt,\@function,6 2035 .align 16 2036 aesni_xts_decrypt: 2037 lea (%rsp),%rax 2038 push %rbp 2039 sub \$$frame_size,%rsp 2040 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2041 ___ 2042 $code.=<<___ if ($win64); 2043 movaps %xmm6,-0xa8(%rax) 2044 movaps %xmm7,-0x98(%rax) 2045 movaps %xmm8,-0x88(%rax) 2046 movaps %xmm9,-0x78(%rax) 2047 movaps %xmm10,-0x68(%rax) 2048 movaps %xmm11,-0x58(%rax) 2049 movaps %xmm12,-0x48(%rax) 2050 movaps %xmm13,-0x38(%rax) 2051 movaps %xmm14,-0x28(%rax) 2052 movaps %xmm15,-0x18(%rax) 2053 .Lxts_dec_body: 2054 ___ 2055 $code.=<<___; 2056 lea -8(%rax),%rbp 2057 movups ($ivp),$inout0 # load clear-text tweak 2058 mov 240($key2),$rounds # key2->rounds 2059 mov 240($key),$rnds_ # key1->rounds 2060 ___ 2061 # generate the tweak 2062 &aesni_generate1("enc",$key2,$rounds,$inout0); 2063 $code.=<<___; 2064 xor %eax,%eax # if ($len%16) len-=16; 2065 test \$15,$len 2066 setnz %al 2067 shl \$4,%rax 
2068 sub %rax,$len 2069 2070 $movkey ($key),$rndkey0 # zero round key 2071 mov $key,$key_ # backup $key 2072 mov $rnds_,$rounds # backup $rounds 2073 shl \$4,$rnds_ 2074 mov $len,$len_ # backup $len 2075 and \$-16,$len 2076 2077 $movkey 16($key,$rnds_),$rndkey1 # last round key 2078 2079 movdqa .Lxts_magic(%rip),$twmask 2080 movdqa $inout0,@tweak[5] 2081 pshufd \$0x5f,$inout0,$twres 2082 pxor $rndkey0,$rndkey1 2083 ___ 2084 for ($i=0;$i<4;$i++) { 2085 $code.=<<___; 2086 movdqa $twres,$twtmp 2087 paddd $twres,$twres 2088 movdqa @tweak[5],@tweak[$i] 2089 psrad \$31,$twtmp # broadcast upper bits 2090 paddq @tweak[5],@tweak[5] 2091 pand $twmask,$twtmp 2092 pxor $rndkey0,@tweak[$i] 2093 pxor $twtmp,@tweak[5] 2094 ___ 2095 } 2096 $code.=<<___; 2097 movdqa @tweak[5],@tweak[4] 2098 psrad \$31,$twres 2099 paddq @tweak[5],@tweak[5] 2100 pand $twmask,$twres 2101 pxor $rndkey0,@tweak[4] 2102 pxor $twres,@tweak[5] 2103 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2104 2105 sub \$16*6,$len 2106 jc .Lxts_dec_short 2107 2108 mov \$16+96,$rounds 2109 lea 32($key_,$rnds_),$key # end of key schedule 2110 sub %r10,%rax # twisted $rounds 2111 $movkey 16($key_),$rndkey1 2112 mov %rax,%r10 # backup twisted $rounds 2113 lea .Lxts_magic(%rip),%r8 2114 jmp .Lxts_dec_grandloop 2115 2116 .align 32 2117 .Lxts_dec_grandloop: 2118 movdqu `16*0`($inp),$inout0 # load input 2119 movdqa $rndkey0,$twmask 2120 movdqu `16*1`($inp),$inout1 2121 pxor @tweak[0],$inout0 2122 movdqu `16*2`($inp),$inout2 2123 pxor @tweak[1],$inout1 2124 aesdec $rndkey1,$inout0 2125 movdqu `16*3`($inp),$inout3 2126 pxor @tweak[2],$inout2 2127 aesdec $rndkey1,$inout1 2128 movdqu `16*4`($inp),$inout4 2129 pxor @tweak[3],$inout3 2130 aesdec $rndkey1,$inout2 2131 movdqu `16*5`($inp),$inout5 2132 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2133 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2134 pxor @tweak[4],$inout4 2135 aesdec $rndkey1,$inout3 2136 $movkey 32($key_),$rndkey0 2137 lea `16*6`($inp),$inp 2138 pxor 
$twmask,$inout5 2139 2140 pxor $twres,@tweak[0] 2141 aesdec $rndkey1,$inout4 2142 pxor $twres,@tweak[1] 2143 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2144 aesdec $rndkey1,$inout5 2145 $movkey 48($key_),$rndkey1 2146 pxor $twres,@tweak[2] 2147 2148 aesdec $rndkey0,$inout0 2149 pxor $twres,@tweak[3] 2150 movdqa @tweak[1],`16*1`(%rsp) 2151 aesdec $rndkey0,$inout1 2152 pxor $twres,@tweak[4] 2153 movdqa @tweak[2],`16*2`(%rsp) 2154 aesdec $rndkey0,$inout2 2155 aesdec $rndkey0,$inout3 2156 pxor $twres,$twmask 2157 movdqa @tweak[4],`16*4`(%rsp) 2158 aesdec $rndkey0,$inout4 2159 aesdec $rndkey0,$inout5 2160 $movkey 64($key_),$rndkey0 2161 movdqa $twmask,`16*5`(%rsp) 2162 pshufd \$0x5f,@tweak[5],$twres 2163 jmp .Lxts_dec_loop6 2164 .align 32 2165 .Lxts_dec_loop6: 2166 aesdec $rndkey1,$inout0 2167 aesdec $rndkey1,$inout1 2168 aesdec $rndkey1,$inout2 2169 aesdec $rndkey1,$inout3 2170 aesdec $rndkey1,$inout4 2171 aesdec $rndkey1,$inout5 2172 $movkey -64($key,%rax),$rndkey1 2173 add \$32,%rax 2174 2175 aesdec $rndkey0,$inout0 2176 aesdec $rndkey0,$inout1 2177 aesdec $rndkey0,$inout2 2178 aesdec $rndkey0,$inout3 2179 aesdec $rndkey0,$inout4 2180 aesdec $rndkey0,$inout5 2181 $movkey -80($key,%rax),$rndkey0 2182 jnz .Lxts_dec_loop6 2183 2184 movdqa (%r8),$twmask 2185 movdqa $twres,$twtmp 2186 paddd $twres,$twres 2187 aesdec $rndkey1,$inout0 2188 paddq @tweak[5],@tweak[5] 2189 psrad \$31,$twtmp 2190 aesdec $rndkey1,$inout1 2191 pand $twmask,$twtmp 2192 $movkey ($key_),@tweak[0] # load round[0] 2193 aesdec $rndkey1,$inout2 2194 aesdec $rndkey1,$inout3 2195 aesdec $rndkey1,$inout4 2196 pxor $twtmp,@tweak[5] 2197 movaps @tweak[0],@tweak[1] # copy round[0] 2198 aesdec $rndkey1,$inout5 2199 $movkey -64($key),$rndkey1 2200 2201 movdqa $twres,$twtmp 2202 aesdec $rndkey0,$inout0 2203 paddd $twres,$twres 2204 pxor @tweak[5],@tweak[0] 2205 aesdec $rndkey0,$inout1 2206 psrad \$31,$twtmp 2207 paddq @tweak[5],@tweak[5] 2208 aesdec $rndkey0,$inout2 2209 aesdec 
$rndkey0,$inout3 2210 pand $twmask,$twtmp 2211 movaps @tweak[1],@tweak[2] 2212 aesdec $rndkey0,$inout4 2213 pxor $twtmp,@tweak[5] 2214 movdqa $twres,$twtmp 2215 aesdec $rndkey0,$inout5 2216 $movkey -48($key),$rndkey0 2217 2218 paddd $twres,$twres 2219 aesdec $rndkey1,$inout0 2220 pxor @tweak[5],@tweak[1] 2221 psrad \$31,$twtmp 2222 aesdec $rndkey1,$inout1 2223 paddq @tweak[5],@tweak[5] 2224 pand $twmask,$twtmp 2225 aesdec $rndkey1,$inout2 2226 aesdec $rndkey1,$inout3 2227 movdqa @tweak[3],`16*3`(%rsp) 2228 pxor $twtmp,@tweak[5] 2229 aesdec $rndkey1,$inout4 2230 movaps @tweak[2],@tweak[3] 2231 movdqa $twres,$twtmp 2232 aesdec $rndkey1,$inout5 2233 $movkey -32($key),$rndkey1 2234 2235 paddd $twres,$twres 2236 aesdec $rndkey0,$inout0 2237 pxor @tweak[5],@tweak[2] 2238 psrad \$31,$twtmp 2239 aesdec $rndkey0,$inout1 2240 paddq @tweak[5],@tweak[5] 2241 pand $twmask,$twtmp 2242 aesdec $rndkey0,$inout2 2243 aesdec $rndkey0,$inout3 2244 aesdec $rndkey0,$inout4 2245 pxor $twtmp,@tweak[5] 2246 movaps @tweak[3],@tweak[4] 2247 aesdec $rndkey0,$inout5 2248 2249 movdqa $twres,$rndkey0 2250 paddd $twres,$twres 2251 aesdec $rndkey1,$inout0 2252 pxor @tweak[5],@tweak[3] 2253 psrad \$31,$rndkey0 2254 aesdec $rndkey1,$inout1 2255 paddq @tweak[5],@tweak[5] 2256 pand $twmask,$rndkey0 2257 aesdec $rndkey1,$inout2 2258 aesdec $rndkey1,$inout3 2259 pxor $rndkey0,@tweak[5] 2260 $movkey ($key_),$rndkey0 2261 aesdec $rndkey1,$inout4 2262 aesdec $rndkey1,$inout5 2263 $movkey 16($key_),$rndkey1 2264 2265 pxor @tweak[5],@tweak[4] 2266 aesdeclast `16*0`(%rsp),$inout0 2267 psrad \$31,$twres 2268 paddq @tweak[5],@tweak[5] 2269 aesdeclast `16*1`(%rsp),$inout1 2270 aesdeclast `16*2`(%rsp),$inout2 2271 pand $twmask,$twres 2272 mov %r10,%rax # restore $rounds 2273 aesdeclast `16*3`(%rsp),$inout3 2274 aesdeclast `16*4`(%rsp),$inout4 2275 aesdeclast `16*5`(%rsp),$inout5 2276 pxor $twres,@tweak[5] 2277 2278 lea `16*6`($out),$out 2279 movups $inout0,`-16*6`($out) # write output 2280 movups 
$inout1,`-16*5`($out) 2281 movups $inout2,`-16*4`($out) 2282 movups $inout3,`-16*3`($out) 2283 movups $inout4,`-16*2`($out) 2284 movups $inout5,`-16*1`($out) 2285 sub \$16*6,$len 2286 jnc .Lxts_dec_grandloop 2287 2288 mov \$16+96,$rounds 2289 sub $rnds_,$rounds 2290 mov $key_,$key # restore $key 2291 shr \$4,$rounds # restore original value 2292 2293 .Lxts_dec_short: 2294 mov $rounds,$rnds_ # backup $rounds 2295 pxor $rndkey0,@tweak[0] 2296 pxor $rndkey0,@tweak[1] 2297 add \$16*6,$len 2298 jz .Lxts_dec_done 2299 2300 pxor $rndkey0,@tweak[2] 2301 cmp \$0x20,$len 2302 jb .Lxts_dec_one 2303 pxor $rndkey0,@tweak[3] 2304 je .Lxts_dec_two 2305 2306 pxor $rndkey0,@tweak[4] 2307 cmp \$0x40,$len 2308 jb .Lxts_dec_three 2309 je .Lxts_dec_four 2310 2311 movdqu ($inp),$inout0 2312 movdqu 16*1($inp),$inout1 2313 movdqu 16*2($inp),$inout2 2314 pxor @tweak[0],$inout0 2315 movdqu 16*3($inp),$inout3 2316 pxor @tweak[1],$inout1 2317 movdqu 16*4($inp),$inout4 2318 lea 16*5($inp),$inp 2319 pxor @tweak[2],$inout2 2320 pxor @tweak[3],$inout3 2321 pxor @tweak[4],$inout4 2322 2323 call _aesni_decrypt6 2324 2325 xorps @tweak[0],$inout0 2326 xorps @tweak[1],$inout1 2327 xorps @tweak[2],$inout2 2328 movdqu $inout0,($out) 2329 xorps @tweak[3],$inout3 2330 movdqu $inout1,16*1($out) 2331 xorps @tweak[4],$inout4 2332 movdqu $inout2,16*2($out) 2333 pxor $twtmp,$twtmp 2334 movdqu $inout3,16*3($out) 2335 pcmpgtd @tweak[5],$twtmp 2336 movdqu $inout4,16*4($out) 2337 lea 16*5($out),$out 2338 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2339 and \$15,$len_ 2340 jz .Lxts_dec_ret 2341 2342 movdqa @tweak[5],@tweak[0] 2343 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2344 pand $twmask,@tweak[1] # isolate carry and residue 2345 pxor @tweak[5],@tweak[1] 2346 jmp .Lxts_dec_done2 2347 2348 .align 16 2349 .Lxts_dec_one: 2350 movups ($inp),$inout0 2351 lea 16*1($inp),$inp 2352 xorps @tweak[0],$inout0 2353 ___ 2354 &aesni_generate1("dec",$key,$rounds); 2355 $code.=<<___; 2356 xorps @tweak[0],$inout0 2357 movdqa 
@tweak[1],@tweak[0] 2358 movups $inout0,($out) 2359 movdqa @tweak[2],@tweak[1] 2360 lea 16*1($out),$out 2361 jmp .Lxts_dec_done 2362 2363 .align 16 2364 .Lxts_dec_two: 2365 movups ($inp),$inout0 2366 movups 16($inp),$inout1 2367 lea 32($inp),$inp 2368 xorps @tweak[0],$inout0 2369 xorps @tweak[1],$inout1 2370 2371 call _aesni_decrypt2 2372 2373 xorps @tweak[0],$inout0 2374 movdqa @tweak[2],@tweak[0] 2375 xorps @tweak[1],$inout1 2376 movdqa @tweak[3],@tweak[1] 2377 movups $inout0,($out) 2378 movups $inout1,16*1($out) 2379 lea 16*2($out),$out 2380 jmp .Lxts_dec_done 2381 2382 .align 16 2383 .Lxts_dec_three: 2384 movups ($inp),$inout0 2385 movups 16*1($inp),$inout1 2386 movups 16*2($inp),$inout2 2387 lea 16*3($inp),$inp 2388 xorps @tweak[0],$inout0 2389 xorps @tweak[1],$inout1 2390 xorps @tweak[2],$inout2 2391 2392 call _aesni_decrypt3 2393 2394 xorps @tweak[0],$inout0 2395 movdqa @tweak[3],@tweak[0] 2396 xorps @tweak[1],$inout1 2397 movdqa @tweak[4],@tweak[1] 2398 xorps @tweak[2],$inout2 2399 movups $inout0,($out) 2400 movups $inout1,16*1($out) 2401 movups $inout2,16*2($out) 2402 lea 16*3($out),$out 2403 jmp .Lxts_dec_done 2404 2405 .align 16 2406 .Lxts_dec_four: 2407 movups ($inp),$inout0 2408 movups 16*1($inp),$inout1 2409 movups 16*2($inp),$inout2 2410 xorps @tweak[0],$inout0 2411 movups 16*3($inp),$inout3 2412 lea 16*4($inp),$inp 2413 xorps @tweak[1],$inout1 2414 xorps @tweak[2],$inout2 2415 xorps @tweak[3],$inout3 2416 2417 call _aesni_decrypt4 2418 2419 pxor @tweak[0],$inout0 2420 movdqa @tweak[4],@tweak[0] 2421 pxor @tweak[1],$inout1 2422 movdqa @tweak[5],@tweak[1] 2423 pxor @tweak[2],$inout2 2424 movdqu $inout0,($out) 2425 pxor @tweak[3],$inout3 2426 movdqu $inout1,16*1($out) 2427 movdqu $inout2,16*2($out) 2428 movdqu $inout3,16*3($out) 2429 lea 16*4($out),$out 2430 jmp .Lxts_dec_done 2431 2432 .align 16 2433 .Lxts_dec_done: 2434 and \$15,$len_ 2435 jz .Lxts_dec_ret 2436 .Lxts_dec_done2: 2437 mov $len_,$len 2438 mov $key_,$key # restore $key 2439 mov 
$rnds_,$rounds	# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax		# borrow $rounds ...
	movzb	($out),%ecx		# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out		# rewind $out
	mov	$key_,$key		# restore $key
	mov	$rnds_,$rounds		# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
	movaps	-0xa0(%rbp),%xmm6
	movaps	-0x90(%rbp),%xmm7
	movaps	-0x80(%rbp),%xmm8
	movaps	-0x70(%rbp),%xmm9
	movaps	-0x60(%rbp),%xmm10
	movaps	-0x50(%rbp),%xmm11
	movaps	-0x40(%rbp),%xmm12
	movaps	-0x30(%rbp),%xmm13
	movaps	-0x20(%rbp),%xmm14
	movaps	-0x10(%rbp),%xmm15
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
#
# CBC encrypt is inherently serial (one block at a time, through
# aesni_generate1); CBC decrypt below is parallelized 6- or 8-wide
# depending on a CPU-capability probe.
{
my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
my $inp_=$key_;

$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0		# load iv as initial state
	mov	$rnds_,$rounds
	cmp	\$16,$len
	jb	.Lcbc_enc_tail
	sub	\$16,$len
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movups	($inp),$inout1		# load input
	lea	16($inp),$inp
	#xorps	$inout1,$inout0
___
	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
	mov	$rnds_,$rounds		# restore $rounds
	mov	$key_,$key		# restore $key
	movups	$inout0,0($out)		# store output
	lea	16($out),$out
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	add	\$16,$len
	jnz	.Lcbc_enc_tail
	movups	$inout0,($ivp)
	jmp	.Lcbc_ret

.Lcbc_enc_tail:
	mov	$len,%rcx	# zaps $key
	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3	# rep movsb
	mov	\$16,%ecx	# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3	# rep stosb
	lea	-16(%rdi),%rdi	# rewind $out by 1 block
	mov	$rnds_,$rounds	# restore $rounds
	mov	%rdi,%rsi	# $inp and $out are the same
	mov	$key_,$key	# restore $key
	xor	$len,$len	# len=16
	jmp	.Lcbc_enc_loop	# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
	lea	(%rsp),%rax
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
	lea	-8(%rax),%rbp
	movups	($ivp),$iv
	mov	$rnds_,$rounds
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	$movkey	($key),$rndkey0
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_six_or_seven

	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
	sub	\$0x50,$len
	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
	je	.Lcbc_dec_loop6_enter
	sub	\$0x20,$len
	lea	0x70($key),$key		# size optimization
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
	movups	$inout7,($out)
	lea	0x10($out),$out
.Lcbc_dec_loop8_enter:
	movdqu		0x60($inp),$inout6
	pxor		$rndkey0,$inout0
	movdqu		0x70($inp),$inout7
	pxor		$rndkey0,$inout1
	$movkey		0x10-0x70($key),$rndkey1
	pxor		$rndkey0,$inout2
	xor		$inp_,$inp_
	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6

	aesdec		$rndkey1,$inout0
	pxor		$rndkey0,$inout7
	$movkey		0x20-0x70($key),$rndkey0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
	aesdec		$rndkey1,$inout6
	setnc		${inp_}b
	shl		\$7,$inp_
	aesdec		$rndkey1,$inout7
	add		$inp,$inp_
	$movkey		0x30-0x70($key),$rndkey1
___
# Emit the remaining rounds of the 8-wide interleave.  $inp_ (alias of
# $key_) was just set to $inp, or to $inp+0x80 when enough input remains
# (setnc of the cmp \$0x70,$len carry, shifted by 7), so the prefetch of
# the next batch below presumably re-reads the current batch on the last
# spin — NOTE(review): confirm against .Lcbc_dec_loop8 tail handling.
# 128/192-bit schedules leave early via the jb/je at $i==7/9.
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___	if ($i==7);
	cmp		\$11,$rounds
___
$code.=<<___;
	aesdec		$rndkeyx,$inout0
	aesdec		$rndkeyx,$inout1
	aesdec		$rndkeyx,$inout2
	aesdec		$rndkeyx,$inout3
	aesdec		$rndkeyx,$inout4
	aesdec		$rndkeyx,$inout5
	aesdec		$rndkeyx,$inout6
	aesdec		$rndkeyx,$inout7
	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
___
$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
	nop
___
$code.=<<___	if ($i==7);
	jb		.Lcbc_dec_done
___
$code.=<<___	if ($i==9);
	je		.Lcbc_dec_done
___
$code.=<<___	if ($i==11);
	jmp		.Lcbc_dec_done
___
}
$code.=<<___;
.align	16
.Lcbc_dec_done:
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	pxor		$rndkey0,$iv
	pxor		$rndkey0,$in0
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	pxor		$rndkey0,$in1
	pxor		$rndkey0,$in2
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
	pxor		$rndkey0,$in3
	pxor		$rndkey0,$in4
	aesdec		$rndkey1,$inout6
	aesdec		$rndkey1,$inout7
	movdqu		0x50($inp),$rndkey1

	aesdeclast	$iv,$inout0
	movdqu		0x60($inp),$iv		# borrow $iv
	pxor		$rndkey0,$rndkey1
	aesdeclast	$in0,$inout1
	pxor		$rndkey0,$iv
	movdqu		0x70($inp),$rndkey0	# next IV
	aesdeclast	$in1,$inout2
	lea		0x80($inp),$inp
	movdqu		0x00($inp_),$in0
	aesdeclast	$in2,$inout3
	aesdeclast	$in3,$inout4
	movdqu		0x10($inp_),$in1
	movdqu		0x20($inp_),$in2
	aesdeclast	$in4,$inout5
	aesdeclast	$rndkey1,$inout6
	movdqu		0x30($inp_),$in3
	movdqu		0x40($inp_),$in4
	aesdeclast	$iv,$inout7
	movdqa		$rndkey0,$iv		# return $iv
	movdqu		0x50($inp_),$rndkey1
	$movkey		-0x70($key),$rndkey0

	movups		$inout0,($out)		# store output
	movdqa		$in0,$inout0
	movups		$inout1,0x10($out)
	movdqa		$in1,$inout1
	movups		$inout2,0x20($out)
	movdqa		$in2,$inout2
	movups		$inout3,0x30($out)
	movdqa		$in3,$inout3
	movups		$inout4,0x40($out)
	movdqa		$in4,$inout4
	movups		$inout5,0x50($out)
	movdqa		$rndkey1,$inout5
	movups		$inout6,0x60($out)
	lea		0x70($out),$out

	sub	\$0x80,$len
	ja	.Lcbc_dec_loop8

	movaps	$inout7,$inout0
	lea	-0x70($key),$key
	add	\$0x70,$len
	jle	.Lcbc_dec_tail_collected
	movups	$inout7,($out)
	lea	0x10($out),$out
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	movaps	$in0,$inout0
.Lcbc_dec_six_or_seven:
	cmp	\$0x60,$len
	ja	.Lcbc_dec_seven

	movaps	$inout5,$inout6
	call	_aesni_decrypt6
	pxor	$iv,$inout0	# ^= IV
	movaps	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	lea	0x50($out),$out
	movdqa	$inout5,$inout0
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_seven:
	movups	0x60($inp),$inout6
	xorps	$inout7,$inout7
	call	_aesni_decrypt8
	movups	0x50($inp),$inout7
	pxor	$iv,$inout0	# ^= IV
	movups	0x60($inp),$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	pxor	$inout7,$inout6
	movdqu	$inout5,0x50($out)
	lea	0x60($out),$out
	movdqa	$inout6,$inout0
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_loop6:
	movups	$inout5,($out)
	lea	0x10($out),$out
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
.Lcbc_dec_loop6_enter:
	lea	0x60($inp),$inp
	movdqa	$inout5,$inout6

	call	_aesni_decrypt6

	pxor	$iv,$inout0	# ^= IV
	movdqa	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	mov	$key_,$key
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	mov	$rnds_,$rounds
	movdqu	$inout4,0x40($out)
	lea	0x50($out),$out
	sub	\$0x60,$len
	ja	.Lcbc_dec_loop6

	movdqa	$inout5,$inout0
	add	\$0x50,$len
	jle	.Lcbc_dec_tail_collected
	movups	$inout5,($out)
	lea	0x10($out),$out

.Lcbc_dec_tail:
	movups	($inp),$inout0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_one

	movups	0x10($inp),$inout1
	movaps	$inout0,$in0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_two

	movups	0x20($inp),$inout2
	movaps	$inout1,$in1
	sub	\$0x10,$len
	jbe	.Lcbc_dec_three

	movups	0x30($inp),$inout3
	movaps	$inout2,$in2
	sub	\$0x10,$len
	jbe	.Lcbc_dec_four

	movups	0x40($inp),$inout4
	movaps	$inout3,$in3
	movaps	$inout4,$in4
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	pxor	$iv,$inout0
	movaps	$in4,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	lea	0x40($out),$out
	movdqa	$inout4,$inout0
	sub	\$0x10,$len
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_one:
	movaps	$inout0,$in0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	$iv,$inout0
	movaps	$in0,$iv
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	movaps	$inout1,$in1
	call	_aesni_decrypt2
	pxor	$iv,$inout0
	movaps	$in1,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	movdqa	$inout1,$inout0
	lea	0x10($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	movaps	$inout2,$in2
	call	_aesni_decrypt3
	pxor	$iv,$inout0
	movaps	$in2,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	movdqa	$inout2,$inout0
	lea	0x20($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
	movaps	$inout3,$in3
	call	_aesni_decrypt4
	pxor	$iv,$inout0
	movaps	$in3,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	movdqa	$inout3,$inout0
	lea	0x30($out),$out
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_tail_collected:
	movups	$iv,($ivp)
	and	\$15,$len
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	movaps	$inout0,(%rsp)
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	(%rsp),%rsi
	.long	0x9066A4F3	# rep movsb

.Lcbc_dec_ret:
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lcbc_ret:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}

# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
#				int bits, AES_KEY *key)
#
# set_decrypt_key builds the encryption schedule, then reverses it in
# place and applies aesimc (InvMixColumns) to the inner round keys so
# that the "equivalent inverse cipher" aesdec flow can be used.
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	$movkey	%xmm0,($inp)
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang\@intel.com>
#	Vinodh Gopal <vinodh.gopal\@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# Returns 0 in %rax on success, -1 for NULL userKey/key, -2 for an
# unsupported key size.  The value left in 240(key) is $bits as set
# below (9/11/13) — see set_decrypt_key's "rounds-1" comment for how
# it is consumed.  Key expansion helpers follow; each one consumes the
# aeskeygenassist result in %xmm1 and the running key state in
# %xmm0/%xmm2, using only %xmm0-5 (Win64: no xmm spills needed).
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	mov	\$-1,%rax
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	lea	16($key),%rax
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call		.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call		.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaning half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax
.Lenc_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align 16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

# Read-only tables used by the bulk routines above: byte-swap shuffle
# mask, CTR increment vectors, the XTS GF(2^128) reduction constant
# (0x87) and a single-block increment.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1

.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only structured-exception handlers: they restore the saved
# non-volatile xmm registers and stack pointer from the interrupted
# frame, then continue unwinding via RtlVirtualUnwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_se_handler,\@abi-omnipotent
.align	16
ecb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp

	jmp	.Lcommon_seh_tail
.size	ecb_se_handler,.-ecb_se_handler

.type	ccm64_se_handler,\@abi-omnipotent
.align	16
ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ccm64_se_handler,.-ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue lable
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp
	lea	-0xa0(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_rbp_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_cbc_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_rbp_tail:
	mov	160($context),%rax	# pull context->Rbp
	mov	(%rax),%rbp		# restore saved %rbp
	lea	8(%rax),%rax		# adjust stack pointer
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail

.Lrestore_cbc_rax:
	mov	120($context),%rax

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_info_ccm64_enc

	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_info_ccm64_dec

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32

	.rva	.LSEH_begin_aesni_xts_encrypt
	.rva	.LSEH_end_aesni_xts_encrypt
	.rva	.LSEH_info_xts_enc

	.rva	.LSEH_begin_aesni_xts_decrypt
	.rva	.LSEH_end_aesni_xts_decrypt
	.rva	.LSEH_info_xts_dec
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_se_handler
.LSEH_info_ccm64_enc:
	.byte	9,0,0,0
	.rva	ccm64_se_handler
	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
.LSEH_info_ccm64_dec:
	.byte	9,0,0,0
	.rva	ccm64_se_handler
	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
.LSEH_info_xts_enc:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.LSEH_info_xts_dec:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# Append a REX prefix (0x40|bits) to @opcode when either register
# index is >= 8 (REX.R for $dst, REX.B for $src); no byte is emitted
# for low registers.  @opcode is passed by glob reference.
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @opcode,$rex|0x40	if($rex);
}

# Translate an AES-NI mnemonic line into an explicit ".byte" encoding
# for assemblers that predate AES-NI support.  Handles three shapes:
# aeskeygenassist imm,xmm,xmm; reg-reg aes* ops; and aes(enc|dec)[last]
# with an %rsp-relative memory operand (one-byte displacement only).
# Returns the original line untouched when it does not match.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\\\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Encode "movbe %eax,disp8(%rsp)" as raw bytes (assemblers may lack
# MOVBE support); the caller supplies the one-byte displacement.
sub movbe {
	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-processing: evaluate `...` compile-time arithmetic, then rewrite
# AES-NI and MOVBE instructions into .byte sequences via the helpers.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap	%eax;	mov	%eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

close STDOUT;