1 #!/usr/bin/env perl 2 # 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 # 10 # This module implements support for Intel AES-NI extension. In 11 # OpenSSL context it's used with Intel engine, but can also be used as 12 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 13 # details]. 14 # 15 # Performance. 16 # 17 # Given aes(enc|dec) instructions' latency asymptotic performance for 18 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 19 # processed with 128-bit key. And given their throughput asymptotic 20 # performance for parallelizable modes is 1.25 cycles per byte. Being 21 # asymptotic limit it's not something you commonly achieve in reality, 22 # but how close does one get? Below are results collected for 23 # different modes and block sized. Pairs of numbers are for en-/ 24 # decryption. 25 # 26 # 16-byte 64-byte 256-byte 1-KB 8-KB 27 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 28 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 29 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 30 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 31 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 32 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 33 # 34 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means 35 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 36 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 37 # The results were collected with specially crafted speed.c benchmark 38 # in order to compare them with results reported in "Intel Advanced 39 # Encryption Standard (AES) New Instruction Set" White Paper Revision 40 # 3.0 dated May 2010. All above results are consistently better. This 41 # module also provides better performance for block sizes smaller than 42 # 128 bytes in points *not* represented in the above table. 43 # 44 # Looking at the results for 8-KB buffer. 45 # 46 # CFB and OFB results are far from the limit, because implementation 47 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 48 # single-block aesni_encrypt, which is not the most optimal way to go. 49 # CBC encrypt result is unexpectedly high and there is no documented 50 # explanation for it. Seemingly there is a small penalty for feeding 51 # the result back to AES unit the way it's done in CBC mode. There is 52 # nothing one can do and the result appears optimal. CCM result is 53 # identical to CBC, because CBC-MAC is essentially CBC encrypt without 54 # saving output. CCM CTR "stays invisible," because it's neatly 55 # interleaved wih CBC-MAC. This provides ~30% improvement over 56 # "straghtforward" CCM implementation with CTR and CBC-MAC performed 57 # disjointly. Parallelizable modes practically achieve the theoretical 58 # limit. 59 # 60 # Looking at how results vary with buffer size. 61 # 62 # Curves are practically saturated at 1-KB buffer size. In most cases 63 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 64 # CTR curve doesn't follow this pattern and is "slowest" changing one 65 # with "256-byte" result being 87% of "8-KB." This is because overhead 66 # in CTR mode is most computationally intensive. Small-block CCM 67 # decrypt is slower than encrypt, because first CTR and last CBC-MAC 68 # iterations can't be interleaved. 69 # 70 # Results for 192- and 256-bit keys. 71 # 72 # EVP-free results were observed to scale perfectly with number of 73 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times 74 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 75 # are a tad smaller, because the above mentioned penalty biases all 76 # results by same constant value. In similar way function call 77 # overhead affects small-block performance, as well as OFB and CFB 78 # results. Differences are not large, most common coefficients are 79 # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 80 # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 81 82 # January 2011 83 # 84 # While Westmere processor features 6 cycles latency for aes[enc|dec] 85 # instructions, which can be scheduled every second cycle, Sandy 86 # Bridge spends 8 cycles per instruction, but it can schedule them 87 # every cycle. This means that code targeting Westmere would perform 88 # suboptimally on Sandy Bridge. Therefore this update. 89 # 90 # In addition, non-parallelizable CBC encrypt (as well as CCM) is 91 # optimized. Relative improvement might appear modest, 8% on Westmere, 92 # but in absolute terms it's 3.77 cycles per byte encrypted with 93 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 94 # should be compared to asymptotic limits of 3.75 for Westmere and 95 # 5.00 for Sandy Bridge. Actually, the fact that they get this close 96 # to asymptotic limits is quite amazing. Indeed, the limit is 97 # calculated as latency times number of rounds, 10 for 128-bit key, 98 # and divided by 16, the number of bytes in block, or in other words 99 # it accounts *solely* for aesenc instructions. But there are extra 100 # instructions, and numbers so close to the asymptotic limits mean 101 # that it's as if it takes as little as *one* additional cycle to 102 # execute all of them. How is it possible? It is possible thanks to 103 # out-of-order execution logic, which manages to overlap post- 104 # processing of previous block, things like saving the output, with 105 # actual encryption of current block, as well as pre-processing of 106 # current block, things like fetching input and xor-ing it with 107 # 0-round element of the key schedule, with actual encryption of 108 # previous block. Keep this in mind... 109 # 110 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 111 # performance is achieved by interleaving instructions working on 112 # independent blocks. In which case asymptotic limit for such modes 113 # can be obtained by dividing above mentioned numbers by AES 114 # instructions' interleave factor. Westmere can execute at most 3 115 # instructions at a time, meaning that optimal interleave factor is 3, 116 # and that's where the "magic" number of 1.25 come from. "Optimal 117 # interleave factor" means that increase of interleave factor does 118 # not improve performance. The formula has proven to reflect reality 119 # pretty well on Westmere... Sandy Bridge on the other hand can 120 # execute up to 8 AES instructions at a time, so how does varying 121 # interleave factor affect the performance? Here is table for ECB 122 # (numbers are cycles per byte processed with 128-bit key): 123 # 124 # instruction interleave factor 3x 6x 8x 125 # theoretical asymptotic limit 1.67 0.83 0.625 126 # measured performance for 8KB block 1.05 0.86 0.84 127 # 128 # "as if" interleave factor 4.7x 5.8x 6.0x 129 # 130 # Further data for other parallelizable modes: 131 # 132 # CBC decrypt 1.16 0.93 0.74 133 # CTR 1.14 0.91 0.74 134 # 135 # Well, given 3x column it's probably inappropriate to call the limit 136 # asymptotic, if it can be surpassed, isn't it? What happens there? 137 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution 138 # magic is responsible for this. Processor overlaps not only the 139 # additional instructions with AES ones, but even AES instuctions 140 # processing adjacent triplets of independent blocks. In the 6x case 141 # additional instructions still claim disproportionally small amount 142 # of additional cycles, but in 8x case number of instructions must be 143 # a tad too high for out-of-order logic to cope with, and AES unit 144 # remains underutilized... As you can see 8x interleave is hardly 145 # justifiable, so there no need to feel bad that 32-bit aesni-x86.pl 146 # utilizies 6x interleave because of limited register bank capacity. 147 # 148 # Higher interleave factors do have negative impact on Westmere 149 # performance. While for ECB mode it's negligible ~1.5%, other 150 # parallelizables perform ~5% worse, which is outweighed by ~25% 151 # improvement on Sandy Bridge. To balance regression on Westmere 152 # CTR mode was implemented with 6x aesenc interleave factor. 153 154 # April 2011 155 # 156 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing 157 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like 158 # in CTR mode AES instruction interleave factor was chosen to be 6x. 159 160 ###################################################################### 161 # Current large-block performance in cycles per byte processed with 162 # 128-bit key (less is better). 163 # 164 # CBC en-/decrypt CTR XTS ECB 165 # Westmere 3.77/1.25 1.25 1.25 1.26 166 # * Bridge 5.07/0.74 0.75 0.90 0.85 167 # Haswell 4.44/0.63 0.63 0.73 0.63 168 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 169 # Bulldozer 5.77/0.70 0.72 0.90 0.70 170 # 171 # (*) Atom Silvermont ECB result is suboptimal because of penalties 172 # incurred by operations on %xmm8-15. As ECB is not considered 173 # critical, nothing was done to mitigate the problem. 174 175 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script 176 # generates drop-in replacement for 177 # crypto/aes/asm/aes-x86_64.pl:-) 178 179 $flavour = shift; 180 $output = shift; 181 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 182 183 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 184 185 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 186 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 187 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 188 die "can't locate x86_64-xlate.pl"; 189 190 open OUT,"| \"$^X\" $xlate $flavour $output"; 191 *STDOUT=*OUT; 192 193 $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; 194 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 195 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 196 197 $code=".text\n"; 198 $code.=".extern OPENSSL_ia32cap_P\n"; 199 200 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! 201 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... 202 $inp="%rdi"; 203 $out="%rsi"; 204 $len="%rdx"; 205 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! 206 $ivp="%r8"; # cbc, ctr, ... 207 208 $rnds_="%r10d"; # backup copy for $rounds 209 $key_="%r11"; # backup copy for $key 210 211 # %xmm register layout 212 $rndkey0="%xmm0"; $rndkey1="%xmm1"; 213 $inout0="%xmm2"; $inout1="%xmm3"; 214 $inout2="%xmm4"; $inout3="%xmm5"; 215 $inout4="%xmm6"; $inout5="%xmm7"; 216 $inout6="%xmm8"; $inout7="%xmm9"; 217 218 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... 219 $in0="%xmm8"; $iv="%xmm9"; 220 222 # Inline version of internal aesni_[en|de]crypt1. 223 # 224 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate 225 # cycles which take care of loop variables... 226 { my $sn; 227 sub aesni_generate1 { 228 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 229 ++$sn; 230 $code.=<<___; 231 $movkey ($key),$rndkey0 232 $movkey 16($key),$rndkey1 233 ___ 234 $code.=<<___ if (defined($ivec)); 235 xorps $rndkey0,$ivec 236 lea 32($key),$key 237 xorps $ivec,$inout 238 ___ 239 $code.=<<___ if (!defined($ivec)); 240 lea 32($key),$key 241 xorps $rndkey0,$inout 242 ___ 243 $code.=<<___; 244 .Loop_${p}1_$sn: 245 aes${p} $rndkey1,$inout 246 dec $rounds 247 $movkey ($key),$rndkey1 248 lea 16($key),$key 249 jnz .Loop_${p}1_$sn # loop body is 16 bytes 250 aes${p}last $rndkey1,$inout 251 ___ 252 }} 253 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 254 # 255 { my ($inp,$out,$key) = @_4args; 256 257 $code.=<<___; 258 .globl ${PREFIX}_encrypt 259 .type ${PREFIX}_encrypt,\@abi-omnipotent 260 .align 16 261 ${PREFIX}_encrypt: 262 movups ($inp),$inout0 # load input 263 mov 240($key),$rounds # key->rounds 264 ___ 265 &aesni_generate1("enc",$key,$rounds); 266 $code.=<<___; 267 pxor $rndkey0,$rndkey0 # clear register bank 268 pxor $rndkey1,$rndkey1 269 movups $inout0,($out) # output 270 pxor $inout0,$inout0 271 ret 272 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt 273 274 .globl ${PREFIX}_decrypt 275 .type ${PREFIX}_decrypt,\@abi-omnipotent 276 .align 16 277 ${PREFIX}_decrypt: 278 movups ($inp),$inout0 # load input 279 mov 240($key),$rounds # key->rounds 280 ___ 281 &aesni_generate1("dec",$key,$rounds); 282 $code.=<<___; 283 pxor $rndkey0,$rndkey0 # clear register bank 284 pxor $rndkey1,$rndkey1 285 movups $inout0,($out) # output 286 pxor $inout0,$inout0 287 ret 288 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt 289 ___ 290 } 291 293 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave 294 # factor. Why 3x subroutine were originally used in loops? Even though 295 # aes[enc|dec] latency was originally 6, it could be scheduled only 296 # every *2nd* cycle. Thus 3x interleave was the one providing optimal 297 # utilization, i.e. when subroutine's throughput is virtually same as 298 # of non-interleaved subroutine [for number of input blocks up to 3]. 299 # This is why it originally made no sense to implement 2x subroutine. 300 # But times change and it became appropriate to spend extra 192 bytes 301 # on 2x subroutine on Atom Silvermont account. For processors that 302 # can schedule aes[enc|dec] every cycle optimal interleave factor 303 # equals to corresponding instructions latency. 8x is optimal for 304 # * Bridge and "super-optimal" for other Intel CPUs... 305 306 sub aesni_generate2 { 307 my $dir=shift; 308 # As already mentioned it takes in $key and $rounds, which are *not* 309 # preserved. $inout[0-1] is cipher/clear text... 310 $code.=<<___; 311 .type _aesni_${dir}rypt2,\@abi-omnipotent 312 .align 16 313 _aesni_${dir}rypt2: 314 $movkey ($key),$rndkey0 315 shl \$4,$rounds 316 $movkey 16($key),$rndkey1 317 xorps $rndkey0,$inout0 318 xorps $rndkey0,$inout1 319 $movkey 32($key),$rndkey0 320 lea 32($key,$rounds),$key 321 neg %rax # $rounds 322 add \$16,%rax 323 324 .L${dir}_loop2: 325 aes${dir} $rndkey1,$inout0 326 aes${dir} $rndkey1,$inout1 327 $movkey ($key,%rax),$rndkey1 328 add \$32,%rax 329 aes${dir} $rndkey0,$inout0 330 aes${dir} $rndkey0,$inout1 331 $movkey -16($key,%rax),$rndkey0 332 jnz .L${dir}_loop2 333 334 aes${dir} $rndkey1,$inout0 335 aes${dir} $rndkey1,$inout1 336 aes${dir}last $rndkey0,$inout0 337 aes${dir}last $rndkey0,$inout1 338 ret 339 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 340 ___ 341 } 342 sub aesni_generate3 { 343 my $dir=shift; 344 # As already mentioned it takes in $key and $rounds, which are *not* 345 # preserved. $inout[0-2] is cipher/clear text... 346 $code.=<<___; 347 .type _aesni_${dir}rypt3,\@abi-omnipotent 348 .align 16 349 _aesni_${dir}rypt3: 350 $movkey ($key),$rndkey0 351 shl \$4,$rounds 352 $movkey 16($key),$rndkey1 353 xorps $rndkey0,$inout0 354 xorps $rndkey0,$inout1 355 xorps $rndkey0,$inout2 356 $movkey 32($key),$rndkey0 357 lea 32($key,$rounds),$key 358 neg %rax # $rounds 359 add \$16,%rax 360 361 .L${dir}_loop3: 362 aes${dir} $rndkey1,$inout0 363 aes${dir} $rndkey1,$inout1 364 aes${dir} $rndkey1,$inout2 365 $movkey ($key,%rax),$rndkey1 366 add \$32,%rax 367 aes${dir} $rndkey0,$inout0 368 aes${dir} $rndkey0,$inout1 369 aes${dir} $rndkey0,$inout2 370 $movkey -16($key,%rax),$rndkey0 371 jnz .L${dir}_loop3 372 373 aes${dir} $rndkey1,$inout0 374 aes${dir} $rndkey1,$inout1 375 aes${dir} $rndkey1,$inout2 376 aes${dir}last $rndkey0,$inout0 377 aes${dir}last $rndkey0,$inout1 378 aes${dir}last $rndkey0,$inout2 379 ret 380 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 381 ___ 382 } 383 # 4x interleave is implemented to improve small block performance, 384 # most notably [and naturally] 4 block by ~30%. One can argue that one 385 # should have implemented 5x as well, but improvement would be <20%, 386 # so it's not worth it... 387 sub aesni_generate4 { 388 my $dir=shift; 389 # As already mentioned it takes in $key and $rounds, which are *not* 390 # preserved. $inout[0-3] is cipher/clear text... 391 $code.=<<___; 392 .type _aesni_${dir}rypt4,\@abi-omnipotent 393 .align 16 394 _aesni_${dir}rypt4: 395 $movkey ($key),$rndkey0 396 shl \$4,$rounds 397 $movkey 16($key),$rndkey1 398 xorps $rndkey0,$inout0 399 xorps $rndkey0,$inout1 400 xorps $rndkey0,$inout2 401 xorps $rndkey0,$inout3 402 $movkey 32($key),$rndkey0 403 lea 32($key,$rounds),$key 404 neg %rax # $rounds 405 .byte 0x0f,0x1f,0x00 406 add \$16,%rax 407 408 .L${dir}_loop4: 409 aes${dir} $rndkey1,$inout0 410 aes${dir} $rndkey1,$inout1 411 aes${dir} $rndkey1,$inout2 412 aes${dir} $rndkey1,$inout3 413 $movkey ($key,%rax),$rndkey1 414 add \$32,%rax 415 aes${dir} $rndkey0,$inout0 416 aes${dir} $rndkey0,$inout1 417 aes${dir} $rndkey0,$inout2 418 aes${dir} $rndkey0,$inout3 419 $movkey -16($key,%rax),$rndkey0 420 jnz .L${dir}_loop4 421 422 aes${dir} $rndkey1,$inout0 423 aes${dir} $rndkey1,$inout1 424 aes${dir} $rndkey1,$inout2 425 aes${dir} $rndkey1,$inout3 426 aes${dir}last $rndkey0,$inout0 427 aes${dir}last $rndkey0,$inout1 428 aes${dir}last $rndkey0,$inout2 429 aes${dir}last $rndkey0,$inout3 430 ret 431 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 432 ___ 433 } 434 sub aesni_generate6 { 435 my $dir=shift; 436 # As already mentioned it takes in $key and $rounds, which are *not* 437 # preserved. $inout[0-5] is cipher/clear text... 438 $code.=<<___; 439 .type _aesni_${dir}rypt6,\@abi-omnipotent 440 .align 16 441 _aesni_${dir}rypt6: 442 $movkey ($key),$rndkey0 443 shl \$4,$rounds 444 $movkey 16($key),$rndkey1 445 xorps $rndkey0,$inout0 446 pxor $rndkey0,$inout1 447 pxor $rndkey0,$inout2 448 aes${dir} $rndkey1,$inout0 449 lea 32($key,$rounds),$key 450 neg %rax # $rounds 451 aes${dir} $rndkey1,$inout1 452 pxor $rndkey0,$inout3 453 pxor $rndkey0,$inout4 454 aes${dir} $rndkey1,$inout2 455 pxor $rndkey0,$inout5 456 $movkey ($key,%rax),$rndkey0 457 add \$16,%rax 458 jmp .L${dir}_loop6_enter 459 .align 16 460 .L${dir}_loop6: 461 aes${dir} $rndkey1,$inout0 462 aes${dir} $rndkey1,$inout1 463 aes${dir} $rndkey1,$inout2 464 .L${dir}_loop6_enter: 465 aes${dir} $rndkey1,$inout3 466 aes${dir} $rndkey1,$inout4 467 aes${dir} $rndkey1,$inout5 468 $movkey ($key,%rax),$rndkey1 469 add \$32,%rax 470 aes${dir} $rndkey0,$inout0 471 aes${dir} $rndkey0,$inout1 472 aes${dir} $rndkey0,$inout2 473 aes${dir} $rndkey0,$inout3 474 aes${dir} $rndkey0,$inout4 475 aes${dir} $rndkey0,$inout5 476 $movkey -16($key,%rax),$rndkey0 477 jnz .L${dir}_loop6 478 479 aes${dir} $rndkey1,$inout0 480 aes${dir} $rndkey1,$inout1 481 aes${dir} $rndkey1,$inout2 482 aes${dir} $rndkey1,$inout3 483 aes${dir} $rndkey1,$inout4 484 aes${dir} $rndkey1,$inout5 485 aes${dir}last $rndkey0,$inout0 486 aes${dir}last $rndkey0,$inout1 487 aes${dir}last $rndkey0,$inout2 488 aes${dir}last $rndkey0,$inout3 489 aes${dir}last $rndkey0,$inout4 490 aes${dir}last $rndkey0,$inout5 491 ret 492 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 493 ___ 494 } 495 sub aesni_generate8 { 496 my $dir=shift; 497 # As already mentioned it takes in $key and $rounds, which are *not* 498 # preserved. $inout[0-7] is cipher/clear text... 499 $code.=<<___; 500 .type _aesni_${dir}rypt8,\@abi-omnipotent 501 .align 16 502 _aesni_${dir}rypt8: 503 $movkey ($key),$rndkey0 504 shl \$4,$rounds 505 $movkey 16($key),$rndkey1 506 xorps $rndkey0,$inout0 507 xorps $rndkey0,$inout1 508 pxor $rndkey0,$inout2 509 pxor $rndkey0,$inout3 510 pxor $rndkey0,$inout4 511 lea 32($key,$rounds),$key 512 neg %rax # $rounds 513 aes${dir} $rndkey1,$inout0 514 pxor $rndkey0,$inout5 515 pxor $rndkey0,$inout6 516 aes${dir} $rndkey1,$inout1 517 pxor $rndkey0,$inout7 518 $movkey ($key,%rax),$rndkey0 519 add \$16,%rax 520 jmp .L${dir}_loop8_inner 521 .align 16 522 .L${dir}_loop8: 523 aes${dir} $rndkey1,$inout0 524 aes${dir} $rndkey1,$inout1 525 .L${dir}_loop8_inner: 526 aes${dir} $rndkey1,$inout2 527 aes${dir} $rndkey1,$inout3 528 aes${dir} $rndkey1,$inout4 529 aes${dir} $rndkey1,$inout5 530 aes${dir} $rndkey1,$inout6 531 aes${dir} $rndkey1,$inout7 532 .L${dir}_loop8_enter: 533 $movkey ($key,%rax),$rndkey1 534 add \$32,%rax 535 aes${dir} $rndkey0,$inout0 536 aes${dir} $rndkey0,$inout1 537 aes${dir} $rndkey0,$inout2 538 aes${dir} $rndkey0,$inout3 539 aes${dir} $rndkey0,$inout4 540 aes${dir} $rndkey0,$inout5 541 aes${dir} $rndkey0,$inout6 542 aes${dir} $rndkey0,$inout7 543 $movkey -16($key,%rax),$rndkey0 544 jnz .L${dir}_loop8 545 546 aes${dir} $rndkey1,$inout0 547 aes${dir} $rndkey1,$inout1 548 aes${dir} $rndkey1,$inout2 549 aes${dir} $rndkey1,$inout3 550 aes${dir} $rndkey1,$inout4 551 aes${dir} $rndkey1,$inout5 552 aes${dir} $rndkey1,$inout6 553 aes${dir} $rndkey1,$inout7 554 aes${dir}last $rndkey0,$inout0 555 aes${dir}last $rndkey0,$inout1 556 aes${dir}last $rndkey0,$inout2 557 aes${dir}last $rndkey0,$inout3 558 aes${dir}last $rndkey0,$inout4 559 aes${dir}last $rndkey0,$inout5 560 aes${dir}last $rndkey0,$inout6 561 aes${dir}last $rndkey0,$inout7 562 ret 563 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 564 ___ 565 } 566 &aesni_generate2("enc") if ($PREFIX eq "aesni"); 567 &aesni_generate2("dec"); 568 &aesni_generate3("enc") if ($PREFIX eq "aesni"); 569 &aesni_generate3("dec"); 570 &aesni_generate4("enc") if ($PREFIX eq "aesni"); 571 &aesni_generate4("dec"); 572 &aesni_generate6("enc") if ($PREFIX eq "aesni"); 573 &aesni_generate6("dec"); 574 &aesni_generate8("enc") if ($PREFIX eq "aesni"); 575 &aesni_generate8("dec"); 576 578 if ($PREFIX eq "aesni") { 579 ######################################################################## 580 # void aesni_ecb_encrypt (const void *in, void *out, 581 # size_t length, const AES_KEY *key, 582 # int enc); 583 $code.=<<___; 584 .globl aesni_ecb_encrypt 585 .type aesni_ecb_encrypt,\@function,5 586 .align 16 587 aesni_ecb_encrypt: 588 ___ 589 $code.=<<___ if ($win64); 590 lea -0x58(%rsp),%rsp 591 movaps %xmm6,(%rsp) # offload $inout4..7 592 movaps %xmm7,0x10(%rsp) 593 movaps %xmm8,0x20(%rsp) 594 movaps %xmm9,0x30(%rsp) 595 .Lecb_enc_body: 596 ___ 597 $code.=<<___; 598 and \$-16,$len # if ($len<16) 599 jz .Lecb_ret # return 600 601 mov 240($key),$rounds # key->rounds 602 $movkey ($key),$rndkey0 603 mov $key,$key_ # backup $key 604 mov $rounds,$rnds_ # backup $rounds 605 test %r8d,%r8d # 5th argument 606 jz .Lecb_decrypt 607 #--------------------------- ECB ENCRYPT ------------------------------# 608 cmp \$0x80,$len # if ($len<8*16) 609 jb .Lecb_enc_tail # short input 610 611 movdqu ($inp),$inout0 # load 8 input blocks 612 movdqu 0x10($inp),$inout1 613 movdqu 0x20($inp),$inout2 614 movdqu 0x30($inp),$inout3 615 movdqu 0x40($inp),$inout4 616 movdqu 0x50($inp),$inout5 617 movdqu 0x60($inp),$inout6 618 movdqu 0x70($inp),$inout7 619 lea 0x80($inp),$inp # $inp+=8*16 620 sub \$0x80,$len # $len-=8*16 (can be zero) 621 jmp .Lecb_enc_loop8_enter 622 .align 16 623 .Lecb_enc_loop8: 624 movups $inout0,($out) # store 8 output blocks 625 mov $key_,$key # restore $key 626 movdqu ($inp),$inout0 # load 8 input blocks 627 mov $rnds_,$rounds # restore $rounds 628 movups $inout1,0x10($out) 629 movdqu 0x10($inp),$inout1 630 movups $inout2,0x20($out) 631 movdqu 0x20($inp),$inout2 632 movups $inout3,0x30($out) 633 movdqu 0x30($inp),$inout3 634 movups $inout4,0x40($out) 635 movdqu 0x40($inp),$inout4 636 movups $inout5,0x50($out) 637 movdqu 0x50($inp),$inout5 638 movups $inout6,0x60($out) 639 movdqu 0x60($inp),$inout6 640 movups $inout7,0x70($out) 641 lea 0x80($out),$out # $out+=8*16 642 movdqu 0x70($inp),$inout7 643 lea 0x80($inp),$inp # $inp+=8*16 644 .Lecb_enc_loop8_enter: 645 646 call _aesni_encrypt8 647 648 sub \$0x80,$len 649 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 650 651 movups $inout0,($out) # store 8 output blocks 652 mov $key_,$key # restore $key 653 movups $inout1,0x10($out) 654 mov $rnds_,$rounds # restore $rounds 655 movups $inout2,0x20($out) 656 movups $inout3,0x30($out) 657 movups $inout4,0x40($out) 658 movups $inout5,0x50($out) 659 movups $inout6,0x60($out) 660 movups $inout7,0x70($out) 661 lea 0x80($out),$out # $out+=8*16 662 add \$0x80,$len # restore real remaining $len 663 jz .Lecb_ret # done if ($len==0) 664 665 .Lecb_enc_tail: # $len is less than 8*16 666 movups ($inp),$inout0 667 cmp \$0x20,$len 668 jb .Lecb_enc_one 669 movups 0x10($inp),$inout1 670 je .Lecb_enc_two 671 movups 0x20($inp),$inout2 672 cmp \$0x40,$len 673 jb .Lecb_enc_three 674 movups 0x30($inp),$inout3 675 je .Lecb_enc_four 676 movups 0x40($inp),$inout4 677 cmp \$0x60,$len 678 jb .Lecb_enc_five 679 movups 0x50($inp),$inout5 680 je .Lecb_enc_six 681 movdqu 0x60($inp),$inout6 682 xorps $inout7,$inout7 683 call _aesni_encrypt8 684 movups $inout0,($out) # store 7 output blocks 685 movups $inout1,0x10($out) 686 movups $inout2,0x20($out) 687 movups $inout3,0x30($out) 688 movups $inout4,0x40($out) 689 movups $inout5,0x50($out) 690 movups $inout6,0x60($out) 691 jmp .Lecb_ret 692 .align 16 693 .Lecb_enc_one: 694 ___ 695 &aesni_generate1("enc",$key,$rounds); 696 $code.=<<___; 697 movups $inout0,($out) # store one output block 698 jmp .Lecb_ret 699 .align 16 700 .Lecb_enc_two: 701 call _aesni_encrypt2 702 movups $inout0,($out) # store 2 output blocks 703 movups $inout1,0x10($out) 704 jmp .Lecb_ret 705 .align 16 706 .Lecb_enc_three: 707 call _aesni_encrypt3 708 movups $inout0,($out) # store 3 output blocks 709 movups $inout1,0x10($out) 710 movups $inout2,0x20($out) 711 jmp .Lecb_ret 712 .align 16 713 .Lecb_enc_four: 714 call _aesni_encrypt4 715 movups $inout0,($out) # store 4 output blocks 716 movups $inout1,0x10($out) 717 movups $inout2,0x20($out) 718 movups $inout3,0x30($out) 719 jmp .Lecb_ret 720 .align 16 721 .Lecb_enc_five: 722 xorps $inout5,$inout5 723 call _aesni_encrypt6 724 movups $inout0,($out) # store 5 output blocks 725 movups $inout1,0x10($out) 726 movups $inout2,0x20($out) 727 movups $inout3,0x30($out) 728 movups $inout4,0x40($out) 729 jmp .Lecb_ret 730 .align 16 731 .Lecb_enc_six: 732 call _aesni_encrypt6 733 movups $inout0,($out) # store 6 output blocks 734 movups $inout1,0x10($out) 735 movups $inout2,0x20($out) 736 movups $inout3,0x30($out) 737 movups $inout4,0x40($out) 738 movups $inout5,0x50($out) 739 jmp .Lecb_ret 740 #--------------------------- ECB DECRYPT ------------------------------# 742 .align 16 743 .Lecb_decrypt: 744 cmp \$0x80,$len # if ($len<8*16) 745 jb .Lecb_dec_tail # short input 746 747 movdqu ($inp),$inout0 # load 8 input blocks 748 movdqu 0x10($inp),$inout1 749 movdqu 0x20($inp),$inout2 750 movdqu 0x30($inp),$inout3 751 movdqu 0x40($inp),$inout4 752 movdqu 0x50($inp),$inout5 753 movdqu 0x60($inp),$inout6 754 movdqu 0x70($inp),$inout7 755 lea 0x80($inp),$inp # $inp+=8*16 756 sub \$0x80,$len # $len-=8*16 (can be zero) 757 jmp .Lecb_dec_loop8_enter 758 .align 16 759 .Lecb_dec_loop8: 760 movups $inout0,($out) # store 8 output blocks 761 mov $key_,$key # restore $key 762 movdqu ($inp),$inout0 # load 8 input blocks 763 mov $rnds_,$rounds # restore $rounds 764 movups $inout1,0x10($out) 765 movdqu 0x10($inp),$inout1 766 movups $inout2,0x20($out) 767 movdqu 0x20($inp),$inout2 768 movups $inout3,0x30($out) 769 movdqu 0x30($inp),$inout3 770 movups $inout4,0x40($out) 771 movdqu 0x40($inp),$inout4 772 movups $inout5,0x50($out) 773 movdqu 0x50($inp),$inout5 774 movups $inout6,0x60($out) 775 movdqu 0x60($inp),$inout6 776 movups $inout7,0x70($out) 777 lea 0x80($out),$out # $out+=8*16 778 movdqu 0x70($inp),$inout7 779 lea 0x80($inp),$inp # $inp+=8*16 780 .Lecb_dec_loop8_enter: 781 782 call _aesni_decrypt8 783 784 $movkey ($key_),$rndkey0 785 sub \$0x80,$len 786 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 787 788 movups $inout0,($out) # store 8 output blocks 789 pxor $inout0,$inout0 # clear register bank 790 mov $key_,$key # restore $key 791 movups $inout1,0x10($out) 792 pxor $inout1,$inout1 793 mov $rnds_,$rounds # restore $rounds 794 movups $inout2,0x20($out) 795 pxor $inout2,$inout2 796 movups $inout3,0x30($out) 797 pxor $inout3,$inout3 798 movups $inout4,0x40($out) 799 pxor $inout4,$inout4 800 movups $inout5,0x50($out) 801 pxor $inout5,$inout5 802 movups $inout6,0x60($out) 803 pxor $inout6,$inout6 804 movups $inout7,0x70($out) 805 pxor $inout7,$inout7 806 lea 0x80($out),$out # $out+=8*16 807 add \$0x80,$len # restore real remaining $len 808 jz .Lecb_ret # done if ($len==0) 809 810 .Lecb_dec_tail: 811 movups ($inp),$inout0 812 cmp \$0x20,$len 813 jb .Lecb_dec_one 814 movups 0x10($inp),$inout1 815 je .Lecb_dec_two 816 movups 0x20($inp),$inout2 817 cmp \$0x40,$len 818 jb .Lecb_dec_three 819 movups 0x30($inp),$inout3 820 je .Lecb_dec_four 821 movups 0x40($inp),$inout4 822 cmp \$0x60,$len 823 jb .Lecb_dec_five 824 movups 0x50($inp),$inout5 825 je .Lecb_dec_six 826 movups 0x60($inp),$inout6 827 $movkey ($key),$rndkey0 828 xorps $inout7,$inout7 829 call _aesni_decrypt8 830 movups $inout0,($out) # store 7 output blocks 831 pxor $inout0,$inout0 # clear register bank 832 movups $inout1,0x10($out) 833 pxor $inout1,$inout1 834 movups $inout2,0x20($out) 835 pxor $inout2,$inout2 836 movups $inout3,0x30($out) 837 pxor $inout3,$inout3 838 movups $inout4,0x40($out) 839 pxor $inout4,$inout4 840 movups $inout5,0x50($out) 841 pxor $inout5,$inout5 842 movups $inout6,0x60($out) 843 pxor $inout6,$inout6 844 pxor $inout7,$inout7 845 jmp .Lecb_ret 846 .align 16 847 .Lecb_dec_one: 848 ___ 849 &aesni_generate1("dec",$key,$rounds); 850 $code.=<<___; 851 movups $inout0,($out) # store one output block 852 pxor $inout0,$inout0 # clear register bank 853 jmp .Lecb_ret 854 .align 16 855 .Lecb_dec_two: 856 call _aesni_decrypt2 857 movups $inout0,($out) # store 2 output blocks 858 pxor $inout0,$inout0 # clear register bank 859 movups $inout1,0x10($out) 860 pxor $inout1,$inout1 861 jmp .Lecb_ret 862 .align 16 863 .Lecb_dec_three: 864 call _aesni_decrypt3 865 movups $inout0,($out) # store 3 output blocks 866 pxor $inout0,$inout0 # clear register bank 867 movups $inout1,0x10($out) 868 pxor $inout1,$inout1 869 movups $inout2,0x20($out) 870 pxor $inout2,$inout2 871 jmp .Lecb_ret 872 .align 16 873 .Lecb_dec_four: 874 call _aesni_decrypt4 875 movups $inout0,($out) # store 4 output blocks 876 pxor $inout0,$inout0 # clear register bank 877 movups $inout1,0x10($out) 878 pxor $inout1,$inout1 879 movups $inout2,0x20($out) 880 pxor $inout2,$inout2 881 movups $inout3,0x30($out) 882 pxor $inout3,$inout3 883 jmp .Lecb_ret 884 .align 16 885 .Lecb_dec_five: 886 xorps $inout5,$inout5 887 call _aesni_decrypt6 888 movups $inout0,($out) # store 5 output blocks 889 pxor $inout0,$inout0 # clear register bank 890 movups $inout1,0x10($out) 891 pxor $inout1,$inout1 892 movups $inout2,0x20($out) 893 pxor $inout2,$inout2 894 movups $inout3,0x30($out) 895 pxor $inout3,$inout3 896 movups $inout4,0x40($out) 897 pxor $inout4,$inout4 898 pxor $inout5,$inout5 899 jmp .Lecb_ret 900 .align 16 901 .Lecb_dec_six: 902 call _aesni_decrypt6 903 movups $inout0,($out) # store 6 output blocks 904 pxor $inout0,$inout0 # clear register bank 905 movups $inout1,0x10($out) 906 pxor $inout1,$inout1 907 movups $inout2,0x20($out) 908 pxor $inout2,$inout2 909 movups $inout3,0x30($out) 910 pxor $inout3,$inout3 911 movups $inout4,0x40($out) 912 pxor $inout4,$inout4 913 movups $inout5,0x50($out) 914 pxor $inout5,$inout5 915 916 .Lecb_ret: 917 xorps $rndkey0,$rndkey0 # %xmm0 918 pxor $rndkey1,$rndkey1 919 ___ 920 $code.=<<___ if ($win64); 921 movaps (%rsp),%xmm6 922 movaps %xmm0,(%rsp) # clear stack 923 movaps 0x10(%rsp),%xmm7 924 movaps %xmm0,0x10(%rsp) 925 movaps 0x20(%rsp),%xmm8 926 movaps %xmm0,0x20(%rsp) 927 movaps 0x30(%rsp),%xmm9 928 movaps %xmm0,0x30(%rsp) 929 lea 0x58(%rsp),%rsp 930 .Lecb_enc_ret: 931 ___ 932 $code.=<<___; 933 ret 934 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt 935 ___ 936 938 { 939 ###################################################################### 940 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 941 # size_t blocks, const AES_KEY *key, 942 # const char *ivec,char *cmac); 943 # 944 # Handles only complete blocks, operates on 64-bit counter and 945 # does not update *ivec! Nor does it finalize CMAC value 946 # (see engine/eng_aesni.c for details) 947 # 948 { 949 my $cmac="%r9"; # 6th argument 950 951 my $increment="%xmm9"; 952 my $iv="%xmm6"; 953 my $bswap_mask="%xmm7"; 954 955 $code.=<<___; 956 .globl aesni_ccm64_encrypt_blocks 957 .type aesni_ccm64_encrypt_blocks,\@function,6 958 .align 16 959 aesni_ccm64_encrypt_blocks: 960 ___ 961 $code.=<<___ if ($win64); 962 lea -0x58(%rsp),%rsp 963 movaps %xmm6,(%rsp) # $iv 964 movaps %xmm7,0x10(%rsp) # $bswap_mask 965 movaps %xmm8,0x20(%rsp) # $in0 966 movaps %xmm9,0x30(%rsp) # $increment 967 .Lccm64_enc_body: 968 ___ 969 $code.=<<___; 970 mov 240($key),$rounds # key->rounds 971 movdqu ($ivp),$iv 972 movdqa .Lincrement64(%rip),$increment 973 movdqa .Lbswap_mask(%rip),$bswap_mask 974 975 shl \$4,$rounds 976 mov \$16,$rnds_ 977 lea 0($key),$key_ 978 movdqu ($cmac),$inout1 979 movdqa $iv,$inout0 980 lea 32($key,$rounds),$key # end of key schedule 981 pshufb $bswap_mask,$iv 982 sub %rax,%r10 # twisted $rounds 983 jmp .Lccm64_enc_outer 984 .align 16 985 .Lccm64_enc_outer: 986 $movkey ($key_),$rndkey0 987 mov %r10,%rax 988 movups ($inp),$in0 # load inp 989 990 xorps $rndkey0,$inout0 # counter 991 $movkey 16($key_),$rndkey1 992 xorps $in0,$rndkey0 993 xorps $rndkey0,$inout1 # cmac^=inp 994 $movkey 32($key_),$rndkey0 995 996 .Lccm64_enc2_loop: 997 aesenc $rndkey1,$inout0 998 aesenc $rndkey1,$inout1 999 $movkey ($key,%rax),$rndkey1 1000 add \$32,%rax 1001 aesenc $rndkey0,$inout0 1002 aesenc $rndkey0,$inout1 1003 $movkey -16($key,%rax),$rndkey0 1004 jnz .Lccm64_enc2_loop 1005 aesenc $rndkey1,$inout0 1006 aesenc $rndkey1,$inout1 1007 paddq $increment,$iv 1008 dec $len # $len-- ($len is in blocks) 1009 aesenclast $rndkey0,$inout0 1010 aesenclast $rndkey0,$inout1 1011 1012 lea 16($inp),$inp 1013 xorps $inout0,$in0 # inp ^= E(iv) 1014 movdqa $iv,$inout0 1015 movups $in0,($out) # save output 1016 pshufb $bswap_mask,$inout0 1017 lea 16($out),$out # $out+=16 1018 jnz .Lccm64_enc_outer # loop if ($len!=0) 1019 1020 pxor $rndkey0,$rndkey0 # clear register bank 1021 pxor $rndkey1,$rndkey1 1022 pxor $inout0,$inout0 1023 movups $inout1,($cmac) # store resulting mac 1024 pxor $inout1,$inout1 1025 pxor $in0,$in0 1026 pxor $iv,$iv 1027 ___ 1028 $code.=<<___ if ($win64); 1029 movaps (%rsp),%xmm6 1030 movaps %xmm0,(%rsp) # clear stack 1031 movaps 0x10(%rsp),%xmm7 1032 movaps %xmm0,0x10(%rsp) 1033 movaps 0x20(%rsp),%xmm8 1034 movaps %xmm0,0x20(%rsp) 1035 movaps 0x30(%rsp),%xmm9 1036 movaps %xmm0,0x30(%rsp) 1037 lea 0x58(%rsp),%rsp 1038 .Lccm64_enc_ret: 1039 ___ 1040 $code.=<<___; 1041 ret 1042 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks 1043 ___ 1044 ###################################################################### 1045 $code.=<<___; 1046 .globl aesni_ccm64_decrypt_blocks 1047 .type aesni_ccm64_decrypt_blocks,\@function,6 1048 .align 16 1049 aesni_ccm64_decrypt_blocks: 1050 ___ 1051 $code.=<<___ if ($win64); 1052 lea -0x58(%rsp),%rsp 1053 movaps %xmm6,(%rsp) # $iv 1054 movaps %xmm7,0x10(%rsp) # $bswap_mask 1055 movaps %xmm8,0x20(%rsp) # $in8 1056 movaps %xmm9,0x30(%rsp) # $increment 1057 .Lccm64_dec_body: 1058 ___ 1059 $code.=<<___; 1060 mov 240($key),$rounds # key->rounds 1061 movups ($ivp),$iv 1062 movdqu ($cmac),$inout1 1063 movdqa .Lincrement64(%rip),$increment 1064 movdqa .Lbswap_mask(%rip),$bswap_mask 1065 1066 movaps $iv,$inout0 1067 mov $rounds,$rnds_ 1068 mov $key,$key_ 1069 pshufb $bswap_mask,$iv 1070 ___ 1071 &aesni_generate1("enc",$key,$rounds); 1072 $code.=<<___; 1073 shl \$4,$rnds_ 1074 mov \$16,$rounds 1075 movups ($inp),$in0 # load inp 1076 paddq $increment,$iv 1077 lea 16($inp),$inp # $inp+=16 1078 sub %r10,%rax # twisted $rounds 1079 lea 32($key_,$rnds_),$key # end of key schedule 1080 mov %rax,%r10 1081 jmp .Lccm64_dec_outer 1082 .align 16 1083 .Lccm64_dec_outer: 1084 xorps $inout0,$in0 # inp ^= E(iv) 1085 movdqa $iv,$inout0 1086 movups $in0,($out) # save output 1087 lea 16($out),$out # $out+=16 1088 pshufb $bswap_mask,$inout0 1089 1090 sub \$1,$len # $len-- ($len is in blocks) 1091 jz .Lccm64_dec_break # if ($len==0) break 1092 1093 $movkey ($key_),$rndkey0 1094 mov %r10,%rax 1095 $movkey 16($key_),$rndkey1 1096 xorps $rndkey0,$in0 1097 xorps $rndkey0,$inout0 1098 xorps $in0,$inout1 # cmac^=out 1099 $movkey 32($key_),$rndkey0 1100 jmp .Lccm64_dec2_loop 1101 .align 16 1102 .Lccm64_dec2_loop: 1103 aesenc $rndkey1,$inout0 1104 aesenc $rndkey1,$inout1 1105 $movkey ($key,%rax),$rndkey1 1106 add \$32,%rax 1107 aesenc $rndkey0,$inout0 1108 aesenc $rndkey0,$inout1 1109 $movkey -16($key,%rax),$rndkey0 1110 jnz .Lccm64_dec2_loop 1111 movups ($inp),$in0 # load input 1112 paddq $increment,$iv 1113 aesenc $rndkey1,$inout0 1114 aesenc $rndkey1,$inout1 1115 aesenclast $rndkey0,$inout0 1116 aesenclast $rndkey0,$inout1 1117 lea 16($inp),$inp # $inp+=16 1118 jmp .Lccm64_dec_outer 1119 1120 .align 16 1121 .Lccm64_dec_break: 1122 #xorps $in0,$inout1 # cmac^=out 1123 mov 240($key_),$rounds 1124 ___ 1125 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1126 $code.=<<___; 1127 pxor $rndkey0,$rndkey0 # clear register bank 1128 pxor $rndkey1,$rndkey1 1129 pxor $inout0,$inout0 1130 movups $inout1,($cmac) # store resulting mac 1131 pxor $inout1,$inout1 1132 pxor $in0,$in0 1133 pxor $iv,$iv 1134 ___ 1135 $code.=<<___ if ($win64); 1136 movaps (%rsp),%xmm6 1137 movaps %xmm0,(%rsp) # clear stack 1138 movaps 0x10(%rsp),%xmm7 1139 movaps %xmm0,0x10(%rsp) 1140 movaps 0x20(%rsp),%xmm8 1141 movaps %xmm0,0x20(%rsp) 1142 movaps 0x30(%rsp),%xmm9 1143 movaps %xmm0,0x30(%rsp) 1144 lea 0x58(%rsp),%rsp 1145 .Lccm64_dec_ret: 1146 ___ 1147 $code.=<<___; 1148 ret 1149 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks 1150 ___ 1151 } 1153 ###################################################################### 1154 # void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1155 # size_t blocks, const AES_KEY *key, 1156 # const char *ivec); 1157 # 1158 # Handles only complete blocks, operates on 32-bit counter and 1159 # does not update *ivec! (see crypto/modes/ctr128.c for details) 1160 # 1161 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1162 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1163 # Keywords are full unroll and modulo-schedule counter calculations 1164 # with zero-round key xor. 1165 { 1166 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1167 my ($key0,$ctr)=("${key_}d","${ivp}d"); 1168 my $frame_size = 0x80 + ($win64?160:0); 1169 1170 $code.=<<___; 1171 .globl aesni_ctr32_encrypt_blocks 1172 .type aesni_ctr32_encrypt_blocks,\@function,5 1173 .align 16 1174 aesni_ctr32_encrypt_blocks: 1175 cmp \$1,$len 1176 jne .Lctr32_bulk 1177 1178 # handle single block without allocating stack frame, 1179 # useful when handling edges 1180 movups ($ivp),$inout0 1181 movups ($inp),$inout1 1182 mov 240($key),%edx # key->rounds 1183 ___ 1184 &aesni_generate1("enc",$key,"%edx"); 1185 $code.=<<___; 1186 pxor $rndkey0,$rndkey0 # clear register bank 1187 pxor $rndkey1,$rndkey1 1188 xorps $inout1,$inout0 1189 pxor $inout1,$inout1 1190 movups $inout0,($out) 1191 xorps $inout0,$inout0 1192 jmp .Lctr32_epilogue 1193 1194 .align 16 1195 .Lctr32_bulk: 1196 lea (%rsp),%rax 1197 push %rbp 1198 sub \$$frame_size,%rsp 1199 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1200 ___ 1201 $code.=<<___ if ($win64); 1202 movaps %xmm6,-0xa8(%rax) # offload everything 1203 movaps %xmm7,-0x98(%rax) 1204 movaps %xmm8,-0x88(%rax) 1205 movaps %xmm9,-0x78(%rax) 1206 movaps %xmm10,-0x68(%rax) 1207 movaps %xmm11,-0x58(%rax) 1208 movaps %xmm12,-0x48(%rax) 1209 movaps %xmm13,-0x38(%rax) 1210 movaps %xmm14,-0x28(%rax) 1211 movaps %xmm15,-0x18(%rax) 1212 .Lctr32_body: 1213 ___ 1214 $code.=<<___; 1215 lea -8(%rax),%rbp 1216 1217 # 8 16-byte words on top of stack are counter values 1218 # xor-ed with zero-round key 1219 1220 movdqu ($ivp),$inout0 1221 movdqu ($key),$rndkey0 1222 mov 12($ivp),$ctr # counter LSB 1223 pxor $rndkey0,$inout0 1224 mov 12($key),$key0 # 0-round key LSB 1225 movdqa $inout0,0x00(%rsp) # populate counter block 1226 bswap $ctr 1227 movdqa $inout0,$inout1 1228 movdqa $inout0,$inout2 1229 movdqa $inout0,$inout3 1230 movdqa $inout0,0x40(%rsp) 1231 movdqa $inout0,0x50(%rsp) 1232 movdqa $inout0,0x60(%rsp) 1233 mov %rdx,%r10 # about to borrow %rdx 1234 movdqa $inout0,0x70(%rsp) 1235 1236 lea 1($ctr),%rax 1237 lea 2($ctr),%rdx 1238 bswap %eax 1239 bswap %edx 1240 xor $key0,%eax 1241 xor $key0,%edx 1242 pinsrd \$3,%eax,$inout1 1243 lea 3($ctr),%rax 1244 movdqa $inout1,0x10(%rsp) 1245 pinsrd \$3,%edx,$inout2 1246 bswap %eax 1247 mov %r10,%rdx # restore %rdx 1248 lea 4($ctr),%r10 1249 movdqa $inout2,0x20(%rsp) 1250 xor $key0,%eax 1251 bswap %r10d 1252 pinsrd \$3,%eax,$inout3 1253 xor $key0,%r10d 1254 movdqa $inout3,0x30(%rsp) 1255 lea 5($ctr),%r9 1256 mov %r10d,0x40+12(%rsp) 1257 bswap %r9d 1258 lea 6($ctr),%r10 1259 mov 240($key),$rounds # key->rounds 1260 xor $key0,%r9d 1261 bswap %r10d 1262 mov %r9d,0x50+12(%rsp) 1263 xor $key0,%r10d 1264 lea 7($ctr),%r9 1265 mov %r10d,0x60+12(%rsp) 1266 bswap %r9d 1267 mov OPENSSL_ia32cap_P+4(%rip),%r10d 1268 xor $key0,%r9d 1269 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1270 mov %r9d,0x70+12(%rsp) 1271 1272 $movkey 0x10($key),$rndkey1 1273 1274 movdqa 0x40(%rsp),$inout4 1275 movdqa 0x50(%rsp),$inout5 1276 1277 cmp \$8,$len # $len is in blocks 1278 jb .Lctr32_tail # short input if ($len<8) 1279 1280 sub \$6,$len # $len is biased by -6 1281 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1282 je .Lctr32_6x # [which denotes Atom Silvermont] 1283 1284 lea 0x80($key),$key # size optimization 1285 sub \$2,$len # $len is biased by -8 1286 jmp .Lctr32_loop8 1287 1288 .align 16 1289 .Lctr32_6x: 1290 shl \$4,$rounds 1291 mov \$48,$rnds_ 1292 bswap $key0 1293 lea 32($key,$rounds),$key # end of key schedule 1294 sub %rax,%r10 # twisted $rounds 1295 jmp .Lctr32_loop6 1296 1297 .align 16 1298 .Lctr32_loop6: 1299 add \$6,$ctr # next counter value 1300 $movkey -48($key,$rnds_),$rndkey0 1301 aesenc $rndkey1,$inout0 1302 mov $ctr,%eax 1303 xor $key0,%eax 1304 aesenc $rndkey1,$inout1 1305 movbe %eax,`0x00+12`(%rsp) # store next counter value 1306 lea 1($ctr),%eax 1307 aesenc $rndkey1,$inout2 1308 xor $key0,%eax 1309 movbe %eax,`0x10+12`(%rsp) 1310 aesenc $rndkey1,$inout3 1311 lea 2($ctr),%eax 1312 xor $key0,%eax 1313 aesenc $rndkey1,$inout4 1314 movbe %eax,`0x20+12`(%rsp) 1315 lea 3($ctr),%eax 1316 aesenc $rndkey1,$inout5 1317 $movkey -32($key,$rnds_),$rndkey1 1318 xor $key0,%eax 1319 1320 aesenc $rndkey0,$inout0 1321 movbe %eax,`0x30+12`(%rsp) 1322 lea 4($ctr),%eax 1323 aesenc $rndkey0,$inout1 1324 xor $key0,%eax 1325 movbe %eax,`0x40+12`(%rsp) 1326 aesenc $rndkey0,$inout2 1327 lea 5($ctr),%eax 1328 xor $key0,%eax 1329 aesenc $rndkey0,$inout3 1330 movbe %eax,`0x50+12`(%rsp) 1331 mov %r10,%rax # mov $rnds_,$rounds 1332 aesenc $rndkey0,$inout4 1333 aesenc $rndkey0,$inout5 1334 $movkey -16($key,$rnds_),$rndkey0 1335 1336 call .Lenc_loop6 1337 1338 movdqu ($inp),$inout6 # load 6 input blocks 1339 movdqu 0x10($inp),$inout7 1340 movdqu 0x20($inp),$in0 1341 movdqu 0x30($inp),$in1 1342 movdqu 0x40($inp),$in2 1343 movdqu 0x50($inp),$in3 1344 lea 0x60($inp),$inp # $inp+=6*16 1345 $movkey -64($key,$rnds_),$rndkey1 1346 pxor $inout0,$inout6 # inp^=E(ctr) 1347 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 1348 pxor $inout1,$inout7 1349 movaps 0x10(%rsp),$inout1 1350 pxor $inout2,$in0 1351 movaps 0x20(%rsp),$inout2 1352 pxor $inout3,$in1 1353 movaps 0x30(%rsp),$inout3 1354 pxor $inout4,$in2 1355 movaps 0x40(%rsp),$inout4 1356 pxor $inout5,$in3 1357 movaps 0x50(%rsp),$inout5 1358 movdqu $inout6,($out) # store 6 output blocks 1359 movdqu $inout7,0x10($out) 1360 movdqu $in0,0x20($out) 1361 movdqu $in1,0x30($out) 1362 movdqu $in2,0x40($out) 1363 movdqu $in3,0x50($out) 1364 lea 0x60($out),$out # $out+=6*16 1365 1366 sub \$6,$len 1367 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 1368 1369 add \$6,$len # restore real remaining $len 1370 jz .Lctr32_done # done if ($len==0) 1371 1372 lea -48($rnds_),$rounds 1373 lea -80($key,$rnds_),$key # restore $key 1374 neg $rounds 1375 shr \$4,$rounds # restore $rounds 1376 jmp .Lctr32_tail 1377 1378 .align 32 1379 .Lctr32_loop8: 1380 add \$8,$ctr # next counter value 1381 movdqa 0x60(%rsp),$inout6 1382 aesenc $rndkey1,$inout0 1383 mov $ctr,%r9d 1384 movdqa 0x70(%rsp),$inout7 1385 aesenc $rndkey1,$inout1 1386 bswap %r9d 1387 $movkey 0x20-0x80($key),$rndkey0 1388 aesenc $rndkey1,$inout2 1389 xor $key0,%r9d 1390 nop 1391 aesenc $rndkey1,$inout3 1392 mov %r9d,0x00+12(%rsp) # store next counter value 1393 lea 1($ctr),%r9 1394 aesenc $rndkey1,$inout4 1395 aesenc $rndkey1,$inout5 1396 aesenc $rndkey1,$inout6 1397 aesenc $rndkey1,$inout7 1398 $movkey 0x30-0x80($key),$rndkey1 1399 ___ 1400 for($i=2;$i<8;$i++) { 1401 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1402 $code.=<<___; 1403 bswap %r9d 1404 aesenc $rndkeyx,$inout0 1405 aesenc $rndkeyx,$inout1 1406 xor $key0,%r9d 1407 .byte 0x66,0x90 1408 aesenc $rndkeyx,$inout2 1409 aesenc $rndkeyx,$inout3 1410 mov %r9d,`0x10*($i-1)`+12(%rsp) 1411 lea $i($ctr),%r9 1412 aesenc $rndkeyx,$inout4 1413 aesenc $rndkeyx,$inout5 1414 aesenc $rndkeyx,$inout6 1415 aesenc $rndkeyx,$inout7 1416 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1417 ___ 1418 } 1419 $code.=<<___; 1420 bswap %r9d 1421 aesenc $rndkey0,$inout0 1422 aesenc $rndkey0,$inout1 1423 aesenc $rndkey0,$inout2 1424 xor $key0,%r9d 1425 movdqu 0x00($inp),$in0 # start loading input 1426 aesenc $rndkey0,$inout3 1427 mov %r9d,0x70+12(%rsp) 1428 cmp \$11,$rounds 1429 aesenc $rndkey0,$inout4 1430 aesenc $rndkey0,$inout5 1431 aesenc $rndkey0,$inout6 1432 aesenc $rndkey0,$inout7 1433 $movkey 0xa0-0x80($key),$rndkey0 1434 1435 jb .Lctr32_enc_done 1436 1437 aesenc $rndkey1,$inout0 1438 aesenc $rndkey1,$inout1 1439 aesenc $rndkey1,$inout2 1440 aesenc $rndkey1,$inout3 1441 aesenc $rndkey1,$inout4 1442 aesenc $rndkey1,$inout5 1443 aesenc $rndkey1,$inout6 1444 aesenc $rndkey1,$inout7 1445 $movkey 0xb0-0x80($key),$rndkey1 1446 1447 aesenc $rndkey0,$inout0 1448 aesenc $rndkey0,$inout1 1449 aesenc $rndkey0,$inout2 1450 aesenc $rndkey0,$inout3 1451 aesenc $rndkey0,$inout4 1452 aesenc $rndkey0,$inout5 1453 aesenc $rndkey0,$inout6 1454 aesenc $rndkey0,$inout7 1455 $movkey 0xc0-0x80($key),$rndkey0 1456 je .Lctr32_enc_done 1457 1458 aesenc $rndkey1,$inout0 1459 aesenc $rndkey1,$inout1 1460 aesenc $rndkey1,$inout2 1461 aesenc $rndkey1,$inout3 1462 aesenc $rndkey1,$inout4 1463 aesenc $rndkey1,$inout5 1464 aesenc $rndkey1,$inout6 1465 aesenc $rndkey1,$inout7 1466 $movkey 0xd0-0x80($key),$rndkey1 1467 1468 aesenc $rndkey0,$inout0 1469 aesenc $rndkey0,$inout1 1470 aesenc $rndkey0,$inout2 1471 aesenc $rndkey0,$inout3 1472 aesenc $rndkey0,$inout4 1473 aesenc $rndkey0,$inout5 1474 aesenc $rndkey0,$inout6 1475 aesenc $rndkey0,$inout7 1476 $movkey 0xe0-0x80($key),$rndkey0 1477 jmp .Lctr32_enc_done 1478 1479 .align 16 1480 .Lctr32_enc_done: 1481 movdqu 0x10($inp),$in1 1482 pxor $rndkey0,$in0 # input^=round[last] 1483 movdqu 0x20($inp),$in2 1484 pxor $rndkey0,$in1 1485 movdqu 0x30($inp),$in3 1486 pxor $rndkey0,$in2 1487 movdqu 0x40($inp),$in4 1488 pxor $rndkey0,$in3 1489 movdqu 0x50($inp),$in5 1490 pxor $rndkey0,$in4 1491 pxor $rndkey0,$in5 1492 aesenc $rndkey1,$inout0 1493 aesenc $rndkey1,$inout1 1494 aesenc $rndkey1,$inout2 1495 aesenc $rndkey1,$inout3 1496 aesenc $rndkey1,$inout4 1497 aesenc $rndkey1,$inout5 1498 aesenc $rndkey1,$inout6 1499 aesenc $rndkey1,$inout7 1500 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 1501 lea 0x80($inp),$inp # $inp+=8*16 1502 1503 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 1504 pxor $rndkey0,$rndkey1 # borrowed $rndkey 1505 movdqu 0x70-0x80($inp),$in0 1506 aesenclast $in1,$inout1 1507 pxor $rndkey0,$in0 1508 movdqa 0x00(%rsp),$in1 # load next counter block 1509 aesenclast $in2,$inout2 1510 aesenclast $in3,$inout3 1511 movdqa 0x10(%rsp),$in2 1512 movdqa 0x20(%rsp),$in3 1513 aesenclast $in4,$inout4 1514 aesenclast $in5,$inout5 1515 movdqa 0x30(%rsp),$in4 1516 movdqa 0x40(%rsp),$in5 1517 aesenclast $rndkey1,$inout6 1518 movdqa 0x50(%rsp),$rndkey0 1519 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 1520 aesenclast $in0,$inout7 1521 1522 movups $inout0,($out) # store 8 output blocks 1523 movdqa $in1,$inout0 1524 movups $inout1,0x10($out) 1525 movdqa $in2,$inout1 1526 movups $inout2,0x20($out) 1527 movdqa $in3,$inout2 1528 movups $inout3,0x30($out) 1529 movdqa $in4,$inout3 1530 movups $inout4,0x40($out) 1531 movdqa $in5,$inout4 1532 movups $inout5,0x50($out) 1533 movdqa $rndkey0,$inout5 1534 movups $inout6,0x60($out) 1535 movups $inout7,0x70($out) 1536 lea 0x80($out),$out # $out+=8*16 1537 1538 sub \$8,$len 1539 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 1540 1541 add \$8,$len # restore real remainig $len 1542 jz .Lctr32_done # done if ($len==0) 1543 lea -0x80($key),$key 1544 1545 .Lctr32_tail: 1546 # note that at this point $inout0..5 are populated with 1547 # counter values xor-ed with 0-round key 1548 lea 16($key),$key 1549 cmp \$4,$len 1550 jb .Lctr32_loop3 1551 je .Lctr32_loop4 1552 1553 # if ($len>4) compute 7 E(counter) 1554 shl \$4,$rounds 1555 movdqa 0x60(%rsp),$inout6 1556 pxor $inout7,$inout7 1557 1558 $movkey 16($key),$rndkey0 1559 aesenc $rndkey1,$inout0 1560 aesenc $rndkey1,$inout1 1561 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1562 neg %rax 1563 aesenc $rndkey1,$inout2 1564 add \$16,%rax # prepare for .Lenc_loop8_enter 1565 movups ($inp),$in0 1566 aesenc $rndkey1,$inout3 1567 aesenc $rndkey1,$inout4 1568 movups 0x10($inp),$in1 # pre-load input 1569 movups 0x20($inp),$in2 1570 aesenc $rndkey1,$inout5 1571 aesenc $rndkey1,$inout6 1572 1573 call .Lenc_loop8_enter 1574 1575 movdqu 0x30($inp),$in3 1576 pxor $in0,$inout0 1577 movdqu 0x40($inp),$in0 1578 pxor $in1,$inout1 1579 movdqu $inout0,($out) # store output 1580 pxor $in2,$inout2 1581 movdqu $inout1,0x10($out) 1582 pxor $in3,$inout3 1583 movdqu $inout2,0x20($out) 1584 pxor $in0,$inout4 1585 movdqu $inout3,0x30($out) 1586 movdqu $inout4,0x40($out) 1587 cmp \$6,$len 1588 jb .Lctr32_done # $len was 5, stop store 1589 1590 movups 0x50($inp),$in1 1591 xorps $in1,$inout5 1592 movups $inout5,0x50($out) 1593 je .Lctr32_done # $len was 6, stop store 1594 1595 movups 0x60($inp),$in2 1596 xorps $in2,$inout6 1597 movups $inout6,0x60($out) 1598 jmp .Lctr32_done # $len was 7, stop store 1599 1600 .align 32 1601 .Lctr32_loop4: 1602 aesenc $rndkey1,$inout0 1603 lea 16($key),$key 1604 dec $rounds 1605 aesenc $rndkey1,$inout1 1606 aesenc $rndkey1,$inout2 1607 aesenc $rndkey1,$inout3 1608 $movkey ($key),$rndkey1 1609 jnz .Lctr32_loop4 1610 aesenclast $rndkey1,$inout0 1611 aesenclast $rndkey1,$inout1 1612 movups ($inp),$in0 # load input 1613 movups 0x10($inp),$in1 1614 aesenclast $rndkey1,$inout2 1615 aesenclast $rndkey1,$inout3 1616 movups 0x20($inp),$in2 1617 movups 0x30($inp),$in3 1618 1619 xorps $in0,$inout0 1620 movups $inout0,($out) # store output 1621 xorps $in1,$inout1 1622 movups $inout1,0x10($out) 1623 pxor $in2,$inout2 1624 movdqu $inout2,0x20($out) 1625 pxor $in3,$inout3 1626 movdqu $inout3,0x30($out) 1627 jmp .Lctr32_done # $len was 4, stop store 1628 1629 .align 32 1630 .Lctr32_loop3: 1631 aesenc $rndkey1,$inout0 1632 lea 16($key),$key 1633 dec $rounds 1634 aesenc $rndkey1,$inout1 1635 aesenc $rndkey1,$inout2 1636 $movkey ($key),$rndkey1 1637 jnz .Lctr32_loop3 1638 aesenclast $rndkey1,$inout0 1639 aesenclast $rndkey1,$inout1 1640 aesenclast $rndkey1,$inout2 1641 1642 movups ($inp),$in0 # load input 1643 xorps $in0,$inout0 1644 movups $inout0,($out) # store output 1645 cmp \$2,$len 1646 jb .Lctr32_done # $len was 1, stop store 1647 1648 movups 0x10($inp),$in1 1649 xorps $in1,$inout1 1650 movups $inout1,0x10($out) 1651 je .Lctr32_done # $len was 2, stop store 1652 1653 movups 0x20($inp),$in2 1654 xorps $in2,$inout2 1655 movups $inout2,0x20($out) # $len was 3, stop store 1656 1657 .Lctr32_done: 1658 xorps %xmm0,%xmm0 # clear regiser bank 1659 xor $key0,$key0 1660 pxor %xmm1,%xmm1 1661 pxor %xmm2,%xmm2 1662 pxor %xmm3,%xmm3 1663 pxor %xmm4,%xmm4 1664 pxor %xmm5,%xmm5 1665 ___ 1666 $code.=<<___ if (!$win64); 1667 pxor %xmm6,%xmm6 1668 pxor %xmm7,%xmm7 1669 movaps %xmm0,0x00(%rsp) # clear stack 1670 pxor %xmm8,%xmm8 1671 movaps %xmm0,0x10(%rsp) 1672 pxor %xmm9,%xmm9 1673 movaps %xmm0,0x20(%rsp) 1674 pxor %xmm10,%xmm10 1675 movaps %xmm0,0x30(%rsp) 1676 pxor %xmm11,%xmm11 1677 movaps %xmm0,0x40(%rsp) 1678 pxor %xmm12,%xmm12 1679 movaps %xmm0,0x50(%rsp) 1680 pxor %xmm13,%xmm13 1681 movaps %xmm0,0x60(%rsp) 1682 pxor %xmm14,%xmm14 1683 movaps %xmm0,0x70(%rsp) 1684 pxor %xmm15,%xmm15 1685 ___ 1686 $code.=<<___ if ($win64); 1687 movaps -0xa0(%rbp),%xmm6 1688 movaps %xmm0,-0xa0(%rbp) # clear stack 1689 movaps -0x90(%rbp),%xmm7 1690 movaps %xmm0,-0x90(%rbp) 1691 movaps -0x80(%rbp),%xmm8 1692 movaps %xmm0,-0x80(%rbp) 1693 movaps -0x70(%rbp),%xmm9 1694 movaps %xmm0,-0x70(%rbp) 1695 movaps -0x60(%rbp),%xmm10 1696 movaps %xmm0,-0x60(%rbp) 1697 movaps -0x50(%rbp),%xmm11 1698 movaps %xmm0,-0x50(%rbp) 1699 movaps -0x40(%rbp),%xmm12 1700 movaps %xmm0,-0x40(%rbp) 1701 movaps -0x30(%rbp),%xmm13 1702 movaps %xmm0,-0x30(%rbp) 1703 movaps -0x20(%rbp),%xmm14 1704 movaps %xmm0,-0x20(%rbp) 1705 movaps -0x10(%rbp),%xmm15 1706 movaps %xmm0,-0x10(%rbp) 1707 movaps %xmm0,0x00(%rsp) 1708 movaps %xmm0,0x10(%rsp) 1709 movaps %xmm0,0x20(%rsp) 1710 movaps %xmm0,0x30(%rsp) 1711 movaps %xmm0,0x40(%rsp) 1712 movaps %xmm0,0x50(%rsp) 1713 movaps %xmm0,0x60(%rsp) 1714 movaps %xmm0,0x70(%rsp) 1715 ___ 1716 $code.=<<___; 1717 lea (%rbp),%rsp 1718 pop %rbp 1719 .Lctr32_epilogue: 1720 ret 1721 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 1722 ___ 1723 } 1724 1726 ###################################################################### 1727 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1728 # const AES_KEY *key1, const AES_KEY *key2 1729 # const unsigned char iv[16]); 1730 # 1731 { 1732 my @tweak=map("%xmm$_",(10..15)); 1733 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1734 my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1735 my $frame_size = 0x70 + ($win64?160:0); 1736 1737 $code.=<<___; 1738 .globl aesni_xts_encrypt 1739 .type aesni_xts_encrypt,\@function,6 1740 .align 16 1741 aesni_xts_encrypt: 1742 lea (%rsp),%rax 1743 push %rbp 1744 sub \$$frame_size,%rsp 1745 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1746 ___ 1747 $code.=<<___ if ($win64); 1748 movaps %xmm6,-0xa8(%rax) # offload everything 1749 movaps %xmm7,-0x98(%rax) 1750 movaps %xmm8,-0x88(%rax) 1751 movaps %xmm9,-0x78(%rax) 1752 movaps %xmm10,-0x68(%rax) 1753 movaps %xmm11,-0x58(%rax) 1754 movaps %xmm12,-0x48(%rax) 1755 movaps %xmm13,-0x38(%rax) 1756 movaps %xmm14,-0x28(%rax) 1757 movaps %xmm15,-0x18(%rax) 1758 .Lxts_enc_body: 1759 ___ 1760 $code.=<<___; 1761 lea -8(%rax),%rbp 1762 movups ($ivp),$inout0 # load clear-text tweak 1763 mov 240(%r8),$rounds # key2->rounds 1764 mov 240($key),$rnds_ # key1->rounds 1765 ___ 1766 # generate the tweak 1767 &aesni_generate1("enc",$key2,$rounds,$inout0); 1768 $code.=<<___; 1769 $movkey ($key),$rndkey0 # zero round key 1770 mov $key,$key_ # backup $key 1771 mov $rnds_,$rounds # backup $rounds 1772 shl \$4,$rnds_ 1773 mov $len,$len_ # backup $len 1774 and \$-16,$len 1775 1776 $movkey 16($key,$rnds_),$rndkey1 # last round key 1777 1778 movdqa .Lxts_magic(%rip),$twmask 1779 movdqa $inout0,@tweak[5] 1780 pshufd \$0x5f,$inout0,$twres 1781 pxor $rndkey0,$rndkey1 1782 ___ 1783 # alternative tweak calculation algorithm is based on suggestions 1784 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1785 # and should help in the future... 1786 for ($i=0;$i<4;$i++) { 1787 $code.=<<___; 1788 movdqa $twres,$twtmp 1789 paddd $twres,$twres 1790 movdqa @tweak[5],@tweak[$i] 1791 psrad \$31,$twtmp # broadcast upper bits 1792 paddq @tweak[5],@tweak[5] 1793 pand $twmask,$twtmp 1794 pxor $rndkey0,@tweak[$i] 1795 pxor $twtmp,@tweak[5] 1796 ___ 1797 } 1798 $code.=<<___; 1799 movdqa @tweak[5],@tweak[4] 1800 psrad \$31,$twres 1801 paddq @tweak[5],@tweak[5] 1802 pand $twmask,$twres 1803 pxor $rndkey0,@tweak[4] 1804 pxor $twres,@tweak[5] 1805 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1806 1807 sub \$16*6,$len 1808 jc .Lxts_enc_short # if $len-=6*16 borrowed 1809 1810 mov \$16+96,$rounds 1811 lea 32($key_,$rnds_),$key # end of key schedule 1812 sub %r10,%rax # twisted $rounds 1813 $movkey 16($key_),$rndkey1 1814 mov %rax,%r10 # backup twisted $rounds 1815 lea .Lxts_magic(%rip),%r8 1816 jmp .Lxts_enc_grandloop 1817 1818 .align 32 1819 .Lxts_enc_grandloop: 1820 movdqu `16*0`($inp),$inout0 # load input 1821 movdqa $rndkey0,$twmask 1822 movdqu `16*1`($inp),$inout1 1823 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1824 movdqu `16*2`($inp),$inout2 1825 pxor @tweak[1],$inout1 1826 aesenc $rndkey1,$inout0 1827 movdqu `16*3`($inp),$inout3 1828 pxor @tweak[2],$inout2 1829 aesenc $rndkey1,$inout1 1830 movdqu `16*4`($inp),$inout4 1831 pxor @tweak[3],$inout3 1832 aesenc $rndkey1,$inout2 1833 movdqu `16*5`($inp),$inout5 1834 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1835 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1836 pxor @tweak[4],$inout4 1837 aesenc $rndkey1,$inout3 1838 $movkey 32($key_),$rndkey0 1839 lea `16*6`($inp),$inp 1840 pxor $twmask,$inout5 1841 1842 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 1843 aesenc $rndkey1,$inout4 1844 pxor $twres,@tweak[1] 1845 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1846 aesenc $rndkey1,$inout5 1847 $movkey 48($key_),$rndkey1 1848 pxor $twres,@tweak[2] 1849 1850 aesenc $rndkey0,$inout0 1851 pxor $twres,@tweak[3] 1852 movdqa @tweak[1],`16*1`(%rsp) 1853 aesenc $rndkey0,$inout1 1854 pxor $twres,@tweak[4] 1855 movdqa @tweak[2],`16*2`(%rsp) 1856 aesenc $rndkey0,$inout2 1857 aesenc $rndkey0,$inout3 1858 pxor $twres,$twmask 1859 movdqa @tweak[4],`16*4`(%rsp) 1860 aesenc $rndkey0,$inout4 1861 aesenc $rndkey0,$inout5 1862 $movkey 64($key_),$rndkey0 1863 movdqa $twmask,`16*5`(%rsp) 1864 pshufd \$0x5f,@tweak[5],$twres 1865 jmp .Lxts_enc_loop6 1866 .align 32 1867 .Lxts_enc_loop6: 1868 aesenc $rndkey1,$inout0 1869 aesenc $rndkey1,$inout1 1870 aesenc $rndkey1,$inout2 1871 aesenc $rndkey1,$inout3 1872 aesenc $rndkey1,$inout4 1873 aesenc $rndkey1,$inout5 1874 $movkey -64($key,%rax),$rndkey1 1875 add \$32,%rax 1876 1877 aesenc $rndkey0,$inout0 1878 aesenc $rndkey0,$inout1 1879 aesenc $rndkey0,$inout2 1880 aesenc $rndkey0,$inout3 1881 aesenc $rndkey0,$inout4 1882 aesenc $rndkey0,$inout5 1883 $movkey -80($key,%rax),$rndkey0 1884 jnz .Lxts_enc_loop6 1885 1886 movdqa (%r8),$twmask # start calculating next tweak 1887 movdqa $twres,$twtmp 1888 paddd $twres,$twres 1889 aesenc $rndkey1,$inout0 1890 paddq @tweak[5],@tweak[5] 1891 psrad \$31,$twtmp 1892 aesenc $rndkey1,$inout1 1893 pand $twmask,$twtmp 1894 $movkey ($key_),@tweak[0] # load round[0] 1895 aesenc $rndkey1,$inout2 1896 aesenc $rndkey1,$inout3 1897 aesenc $rndkey1,$inout4 1898 pxor $twtmp,@tweak[5] 1899 movaps @tweak[0],@tweak[1] # copy round[0] 1900 aesenc $rndkey1,$inout5 1901 $movkey -64($key),$rndkey1 1902 1903 movdqa $twres,$twtmp 1904 aesenc $rndkey0,$inout0 1905 paddd $twres,$twres 1906 pxor @tweak[5],@tweak[0] 1907 aesenc $rndkey0,$inout1 1908 psrad \$31,$twtmp 1909 paddq @tweak[5],@tweak[5] 1910 aesenc $rndkey0,$inout2 1911 aesenc $rndkey0,$inout3 1912 pand $twmask,$twtmp 1913 movaps @tweak[1],@tweak[2] 1914 aesenc $rndkey0,$inout4 1915 pxor $twtmp,@tweak[5] 1916 movdqa $twres,$twtmp 1917 aesenc $rndkey0,$inout5 1918 $movkey -48($key),$rndkey0 1919 1920 paddd $twres,$twres 1921 aesenc $rndkey1,$inout0 1922 pxor @tweak[5],@tweak[1] 1923 psrad \$31,$twtmp 1924 aesenc $rndkey1,$inout1 1925 paddq @tweak[5],@tweak[5] 1926 pand $twmask,$twtmp 1927 aesenc $rndkey1,$inout2 1928 aesenc $rndkey1,$inout3 1929 movdqa @tweak[3],`16*3`(%rsp) 1930 pxor $twtmp,@tweak[5] 1931 aesenc $rndkey1,$inout4 1932 movaps @tweak[2],@tweak[3] 1933 movdqa $twres,$twtmp 1934 aesenc $rndkey1,$inout5 1935 $movkey -32($key),$rndkey1 1936 1937 paddd $twres,$twres 1938 aesenc $rndkey0,$inout0 1939 pxor @tweak[5],@tweak[2] 1940 psrad \$31,$twtmp 1941 aesenc $rndkey0,$inout1 1942 paddq @tweak[5],@tweak[5] 1943 pand $twmask,$twtmp 1944 aesenc $rndkey0,$inout2 1945 aesenc $rndkey0,$inout3 1946 aesenc $rndkey0,$inout4 1947 pxor $twtmp,@tweak[5] 1948 movaps @tweak[3],@tweak[4] 1949 aesenc $rndkey0,$inout5 1950 1951 movdqa $twres,$rndkey0 1952 paddd $twres,$twres 1953 aesenc $rndkey1,$inout0 1954 pxor @tweak[5],@tweak[3] 1955 psrad \$31,$rndkey0 1956 aesenc $rndkey1,$inout1 1957 paddq @tweak[5],@tweak[5] 1958 pand $twmask,$rndkey0 1959 aesenc $rndkey1,$inout2 1960 aesenc $rndkey1,$inout3 1961 pxor $rndkey0,@tweak[5] 1962 $movkey ($key_),$rndkey0 1963 aesenc $rndkey1,$inout4 1964 aesenc $rndkey1,$inout5 1965 $movkey 16($key_),$rndkey1 1966 1967 pxor @tweak[5],@tweak[4] 1968 aesenclast `16*0`(%rsp),$inout0 1969 psrad \$31,$twres 1970 paddq @tweak[5],@tweak[5] 1971 aesenclast `16*1`(%rsp),$inout1 1972 aesenclast `16*2`(%rsp),$inout2 1973 pand $twmask,$twres 1974 mov %r10,%rax # restore $rounds 1975 aesenclast `16*3`(%rsp),$inout3 1976 aesenclast `16*4`(%rsp),$inout4 1977 aesenclast `16*5`(%rsp),$inout5 1978 pxor $twres,@tweak[5] 1979 1980 lea `16*6`($out),$out # $out+=6*16 1981 movups $inout0,`-16*6`($out) # store 6 output blocks 1982 movups $inout1,`-16*5`($out) 1983 movups $inout2,`-16*4`($out) 1984 movups $inout3,`-16*3`($out) 1985 movups $inout4,`-16*2`($out) 1986 movups $inout5,`-16*1`($out) 1987 sub \$16*6,$len 1988 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 1989 1990 mov \$16+96,$rounds 1991 sub $rnds_,$rounds 1992 mov $key_,$key # restore $key 1993 shr \$4,$rounds # restore original value 1994 1995 .Lxts_enc_short: 1996 # at the point @tweak[0..5] are populated with tweak values 1997 mov $rounds,$rnds_ # backup $rounds 1998 pxor $rndkey0,@tweak[0] 1999 add \$16*6,$len # restore real remaining $len 2000 jz .Lxts_enc_done # done if ($len==0) 2001 2002 pxor $rndkey0,@tweak[1] 2003 cmp \$0x20,$len 2004 jb .Lxts_enc_one # $len is 1*16 2005 pxor $rndkey0,@tweak[2] 2006 je .Lxts_enc_two # $len is 2*16 2007 2008 pxor $rndkey0,@tweak[3] 2009 cmp \$0x40,$len 2010 jb .Lxts_enc_three # $len is 3*16 2011 pxor $rndkey0,@tweak[4] 2012 je .Lxts_enc_four # $len is 4*16 2013 2014 movdqu ($inp),$inout0 # $len is 5*16 2015 movdqu 16*1($inp),$inout1 2016 movdqu 16*2($inp),$inout2 2017 pxor @tweak[0],$inout0 2018 movdqu 16*3($inp),$inout3 2019 pxor @tweak[1],$inout1 2020 movdqu 16*4($inp),$inout4 2021 lea 16*5($inp),$inp # $inp+=5*16 2022 pxor @tweak[2],$inout2 2023 pxor @tweak[3],$inout3 2024 pxor @tweak[4],$inout4 2025 pxor $inout5,$inout5 2026 2027 call _aesni_encrypt6 2028 2029 xorps @tweak[0],$inout0 2030 movdqa @tweak[5],@tweak[0] 2031 xorps @tweak[1],$inout1 2032 xorps @tweak[2],$inout2 2033 movdqu $inout0,($out) # store 5 output blocks 2034 xorps @tweak[3],$inout3 2035 movdqu $inout1,16*1($out) 2036 xorps @tweak[4],$inout4 2037 movdqu $inout2,16*2($out) 2038 movdqu $inout3,16*3($out) 2039 movdqu $inout4,16*4($out) 2040 lea 16*5($out),$out # $out+=5*16 2041 jmp .Lxts_enc_done 2042 2043 .align 16 2044 .Lxts_enc_one: 2045 movups ($inp),$inout0 2046 lea 16*1($inp),$inp # inp+=1*16 2047 xorps @tweak[0],$inout0 2048 ___ 2049 &aesni_generate1("enc",$key,$rounds); 2050 $code.=<<___; 2051 xorps @tweak[0],$inout0 2052 movdqa @tweak[1],@tweak[0] 2053 movups $inout0,($out) # store one output block 2054 lea 16*1($out),$out # $out+=1*16 2055 jmp .Lxts_enc_done 2056 2057 .align 16 2058 .Lxts_enc_two: 2059 movups ($inp),$inout0 2060 movups 16($inp),$inout1 2061 lea 32($inp),$inp # $inp+=2*16 2062 xorps @tweak[0],$inout0 2063 xorps @tweak[1],$inout1 2064 2065 call _aesni_encrypt2 2066 2067 xorps @tweak[0],$inout0 2068 movdqa @tweak[2],@tweak[0] 2069 xorps @tweak[1],$inout1 2070 movups $inout0,($out) # store 2 output blocks 2071 movups $inout1,16*1($out) 2072 lea 16*2($out),$out # $out+=2*16 2073 jmp .Lxts_enc_done 2074 2075 .align 16 2076 .Lxts_enc_three: 2077 movups ($inp),$inout0 2078 movups 16*1($inp),$inout1 2079 movups 16*2($inp),$inout2 2080 lea 16*3($inp),$inp # $inp+=3*16 2081 xorps @tweak[0],$inout0 2082 xorps @tweak[1],$inout1 2083 xorps @tweak[2],$inout2 2084 2085 call _aesni_encrypt3 2086 2087 xorps @tweak[0],$inout0 2088 movdqa @tweak[3],@tweak[0] 2089 xorps @tweak[1],$inout1 2090 xorps @tweak[2],$inout2 2091 movups $inout0,($out) # store 3 output blocks 2092 movups $inout1,16*1($out) 2093 movups $inout2,16*2($out) 2094 lea 16*3($out),$out # $out+=3*16 2095 jmp .Lxts_enc_done 2096 2097 .align 16 2098 .Lxts_enc_four: 2099 movups ($inp),$inout0 2100 movups 16*1($inp),$inout1 2101 movups 16*2($inp),$inout2 2102 xorps @tweak[0],$inout0 2103 movups 16*3($inp),$inout3 2104 lea 16*4($inp),$inp # $inp+=4*16 2105 xorps @tweak[1],$inout1 2106 xorps @tweak[2],$inout2 2107 xorps @tweak[3],$inout3 2108 2109 call _aesni_encrypt4 2110 2111 pxor @tweak[0],$inout0 2112 movdqa @tweak[4],@tweak[0] 2113 pxor @tweak[1],$inout1 2114 pxor @tweak[2],$inout2 2115 movdqu $inout0,($out) # store 4 output blocks 2116 pxor @tweak[3],$inout3 2117 movdqu $inout1,16*1($out) 2118 movdqu $inout2,16*2($out) 2119 movdqu $inout3,16*3($out) 2120 lea 16*4($out),$out # $out+=4*16 2121 jmp .Lxts_enc_done 2122 2123 .align 16 2124 .Lxts_enc_done: 2125 and \$15,$len_ # see if $len%16 is 0 2126 jz .Lxts_enc_ret 2127 mov $len_,$len 2128 2129 .Lxts_enc_steal: 2130 movzb ($inp),%eax # borrow $rounds ... 2131 movzb -16($out),%ecx # ... and $key 2132 lea 1($inp),$inp 2133 mov %al,-16($out) 2134 mov %cl,0($out) 2135 lea 1($out),$out 2136 sub \$1,$len 2137 jnz .Lxts_enc_steal 2138 2139 sub $len_,$out # rewind $out 2140 mov $key_,$key # restore $key 2141 mov $rnds_,$rounds # restore $rounds 2142 2143 movups -16($out),$inout0 2144 xorps @tweak[0],$inout0 2145 ___ 2146 &aesni_generate1("enc",$key,$rounds); 2147 $code.=<<___; 2148 xorps @tweak[0],$inout0 2149 movups $inout0,-16($out) 2150 2151 .Lxts_enc_ret: 2152 xorps %xmm0,%xmm0 # clear register bank 2153 pxor %xmm1,%xmm1 2154 pxor %xmm2,%xmm2 2155 pxor %xmm3,%xmm3 2156 pxor %xmm4,%xmm4 2157 pxor %xmm5,%xmm5 2158 ___ 2159 $code.=<<___ if (!$win64); 2160 pxor %xmm6,%xmm6 2161 pxor %xmm7,%xmm7 2162 movaps %xmm0,0x00(%rsp) # clear stack 2163 pxor %xmm8,%xmm8 2164 movaps %xmm0,0x10(%rsp) 2165 pxor %xmm9,%xmm9 2166 movaps %xmm0,0x20(%rsp) 2167 pxor %xmm10,%xmm10 2168 movaps %xmm0,0x30(%rsp) 2169 pxor %xmm11,%xmm11 2170 movaps %xmm0,0x40(%rsp) 2171 pxor %xmm12,%xmm12 2172 movaps %xmm0,0x50(%rsp) 2173 pxor %xmm13,%xmm13 2174 movaps %xmm0,0x60(%rsp) 2175 pxor %xmm14,%xmm14 2176 pxor %xmm15,%xmm15 2177 ___ 2178 $code.=<<___ if ($win64); 2179 movaps -0xa0(%rbp),%xmm6 2180 movaps %xmm0,-0xa0(%rbp) # clear stack 2181 movaps -0x90(%rbp),%xmm7 2182 movaps %xmm0,-0x90(%rbp) 2183 movaps -0x80(%rbp),%xmm8 2184 movaps %xmm0,-0x80(%rbp) 2185 movaps -0x70(%rbp),%xmm9 2186 movaps %xmm0,-0x70(%rbp) 2187 movaps -0x60(%rbp),%xmm10 2188 movaps %xmm0,-0x60(%rbp) 2189 movaps -0x50(%rbp),%xmm11 2190 movaps %xmm0,-0x50(%rbp) 2191 movaps -0x40(%rbp),%xmm12 2192 movaps %xmm0,-0x40(%rbp) 2193 movaps -0x30(%rbp),%xmm13 2194 movaps %xmm0,-0x30(%rbp) 2195 movaps -0x20(%rbp),%xmm14 2196 movaps %xmm0,-0x20(%rbp) 2197 movaps -0x10(%rbp),%xmm15 2198 movaps %xmm0,-0x10(%rbp) 2199 movaps %xmm0,0x00(%rsp) 2200 movaps %xmm0,0x10(%rsp) 2201 movaps %xmm0,0x20(%rsp) 2202 movaps %xmm0,0x30(%rsp) 2203 movaps %xmm0,0x40(%rsp) 2204 movaps %xmm0,0x50(%rsp) 2205 movaps %xmm0,0x60(%rsp) 2206 ___ 2207 $code.=<<___; 2208 lea (%rbp),%rsp 2209 pop %rbp 2210 .Lxts_enc_epilogue: 2211 ret 2212 .size aesni_xts_encrypt,.-aesni_xts_encrypt 2213 ___ 2214 2215 $code.=<<___; 2216 .globl aesni_xts_decrypt 2217 .type aesni_xts_decrypt,\@function,6 2218 .align 16 2219 aesni_xts_decrypt: 2220 lea (%rsp),%rax 2221 push %rbp 2222 sub \$$frame_size,%rsp 2223 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2224 ___ 2225 $code.=<<___ if ($win64); 2226 movaps %xmm6,-0xa8(%rax) # offload everything 2227 movaps %xmm7,-0x98(%rax) 2228 movaps %xmm8,-0x88(%rax) 2229 movaps %xmm9,-0x78(%rax) 2230 movaps %xmm10,-0x68(%rax) 2231 movaps %xmm11,-0x58(%rax) 2232 movaps %xmm12,-0x48(%rax) 2233 movaps %xmm13,-0x38(%rax) 2234 movaps %xmm14,-0x28(%rax) 2235 movaps %xmm15,-0x18(%rax) 2236 .Lxts_dec_body: 2237 ___ 2238 $code.=<<___; 2239 lea -8(%rax),%rbp 2240 movups ($ivp),$inout0 # load clear-text tweak 2241 mov 240($key2),$rounds # key2->rounds 2242 mov 240($key),$rnds_ # key1->rounds 2243 ___ 2244 # generate the tweak 2245 &aesni_generate1("enc",$key2,$rounds,$inout0); 2246 $code.=<<___; 2247 xor %eax,%eax # if ($len%16) len-=16; 2248 test \$15,$len 2249 setnz %al 2250 shl \$4,%rax 2251 sub %rax,$len 2252 2253 $movkey ($key),$rndkey0 # zero round key 2254 mov $key,$key_ # backup $key 2255 mov $rnds_,$rounds # backup $rounds 2256 shl \$4,$rnds_ 2257 mov $len,$len_ # backup $len 2258 and \$-16,$len 2259 2260 $movkey 16($key,$rnds_),$rndkey1 # last round key 2261 2262 movdqa .Lxts_magic(%rip),$twmask 2263 movdqa $inout0,@tweak[5] 2264 pshufd \$0x5f,$inout0,$twres 2265 pxor $rndkey0,$rndkey1 2266 ___ 2267 for ($i=0;$i<4;$i++) { 2268 $code.=<<___; 2269 movdqa $twres,$twtmp 2270 paddd $twres,$twres 2271 movdqa @tweak[5],@tweak[$i] 2272 psrad \$31,$twtmp # broadcast upper bits 2273 paddq @tweak[5],@tweak[5] 2274 pand $twmask,$twtmp 2275 pxor $rndkey0,@tweak[$i] 2276 pxor $twtmp,@tweak[5] 2277 ___ 2278 } 2279 $code.=<<___; 2280 movdqa @tweak[5],@tweak[4] 2281 psrad \$31,$twres 2282 paddq @tweak[5],@tweak[5] 2283 pand $twmask,$twres 2284 pxor $rndkey0,@tweak[4] 2285 pxor $twres,@tweak[5] 2286 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2287 2288 sub \$16*6,$len 2289 jc .Lxts_dec_short # if $len-=6*16 borrowed 2290 2291 mov \$16+96,$rounds 2292 lea 32($key_,$rnds_),$key # end of key schedule 2293 sub %r10,%rax # twisted $rounds 2294 $movkey 16($key_),$rndkey1 2295 mov %rax,%r10 # backup twisted $rounds 2296 lea .Lxts_magic(%rip),%r8 2297 jmp .Lxts_dec_grandloop 2298 2299 .align 32 2300 .Lxts_dec_grandloop: 2301 movdqu `16*0`($inp),$inout0 # load input 2302 movdqa $rndkey0,$twmask 2303 movdqu `16*1`($inp),$inout1 2304 pxor @tweak[0],$inout0 # intput^=tweak^round[0] 2305 movdqu `16*2`($inp),$inout2 2306 pxor @tweak[1],$inout1 2307 aesdec $rndkey1,$inout0 2308 movdqu `16*3`($inp),$inout3 2309 pxor @tweak[2],$inout2 2310 aesdec $rndkey1,$inout1 2311 movdqu `16*4`($inp),$inout4 2312 pxor @tweak[3],$inout3 2313 aesdec $rndkey1,$inout2 2314 movdqu `16*5`($inp),$inout5 2315 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2316 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2317 pxor @tweak[4],$inout4 2318 aesdec $rndkey1,$inout3 2319 $movkey 32($key_),$rndkey0 2320 lea `16*6`($inp),$inp 2321 pxor $twmask,$inout5 2322 2323 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 2324 aesdec $rndkey1,$inout4 2325 pxor $twres,@tweak[1] 2326 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2327 aesdec $rndkey1,$inout5 2328 $movkey 48($key_),$rndkey1 2329 pxor $twres,@tweak[2] 2330 2331 aesdec $rndkey0,$inout0 2332 pxor $twres,@tweak[3] 2333 movdqa @tweak[1],`16*1`(%rsp) 2334 aesdec $rndkey0,$inout1 2335 pxor $twres,@tweak[4] 2336 movdqa @tweak[2],`16*2`(%rsp) 2337 aesdec $rndkey0,$inout2 2338 aesdec $rndkey0,$inout3 2339 pxor $twres,$twmask 2340 movdqa @tweak[4],`16*4`(%rsp) 2341 aesdec $rndkey0,$inout4 2342 aesdec $rndkey0,$inout5 2343 $movkey 64($key_),$rndkey0 2344 movdqa $twmask,`16*5`(%rsp) 2345 pshufd \$0x5f,@tweak[5],$twres 2346 jmp .Lxts_dec_loop6 2347 .align 32 2348 .Lxts_dec_loop6: 2349 aesdec $rndkey1,$inout0 2350 aesdec $rndkey1,$inout1 2351 aesdec $rndkey1,$inout2 2352 aesdec $rndkey1,$inout3 2353 aesdec $rndkey1,$inout4 2354 aesdec $rndkey1,$inout5 2355 $movkey -64($key,%rax),$rndkey1 2356 add \$32,%rax 2357 2358 aesdec $rndkey0,$inout0 2359 aesdec $rndkey0,$inout1 2360 aesdec $rndkey0,$inout2 2361 aesdec $rndkey0,$inout3 2362 aesdec $rndkey0,$inout4 2363 aesdec $rndkey0,$inout5 2364 $movkey -80($key,%rax),$rndkey0 2365 jnz .Lxts_dec_loop6 2366 2367 movdqa (%r8),$twmask # start calculating next tweak 2368 movdqa $twres,$twtmp 2369 paddd $twres,$twres 2370 aesdec $rndkey1,$inout0 2371 paddq @tweak[5],@tweak[5] 2372 psrad \$31,$twtmp 2373 aesdec $rndkey1,$inout1 2374 pand $twmask,$twtmp 2375 $movkey ($key_),@tweak[0] # load round[0] 2376 aesdec $rndkey1,$inout2 2377 aesdec $rndkey1,$inout3 2378 aesdec $rndkey1,$inout4 2379 pxor $twtmp,@tweak[5] 2380 movaps @tweak[0],@tweak[1] # copy round[0] 2381 aesdec $rndkey1,$inout5 2382 $movkey -64($key),$rndkey1 2383 2384 movdqa $twres,$twtmp 2385 aesdec $rndkey0,$inout0 2386 paddd $twres,$twres 2387 pxor @tweak[5],@tweak[0] 2388 aesdec $rndkey0,$inout1 2389 psrad \$31,$twtmp 2390 paddq @tweak[5],@tweak[5] 2391 aesdec $rndkey0,$inout2 2392 aesdec $rndkey0,$inout3 2393 pand $twmask,$twtmp 2394 movaps @tweak[1],@tweak[2] 2395 aesdec $rndkey0,$inout4 2396 pxor $twtmp,@tweak[5] 2397 movdqa $twres,$twtmp 2398 aesdec $rndkey0,$inout5 2399 $movkey -48($key),$rndkey0 2400 2401 paddd $twres,$twres 2402 aesdec $rndkey1,$inout0 2403 pxor @tweak[5],@tweak[1] 2404 psrad \$31,$twtmp 2405 aesdec $rndkey1,$inout1 2406 paddq @tweak[5],@tweak[5] 2407 pand $twmask,$twtmp 2408 aesdec $rndkey1,$inout2 2409 aesdec $rndkey1,$inout3 2410 movdqa @tweak[3],`16*3`(%rsp) 2411 pxor $twtmp,@tweak[5] 2412 aesdec $rndkey1,$inout4 2413 movaps @tweak[2],@tweak[3] 2414 movdqa $twres,$twtmp 2415 aesdec $rndkey1,$inout5 2416 $movkey -32($key),$rndkey1 2417 2418 paddd $twres,$twres 2419 aesdec $rndkey0,$inout0 2420 pxor @tweak[5],@tweak[2] 2421 psrad \$31,$twtmp 2422 aesdec $rndkey0,$inout1 2423 paddq @tweak[5],@tweak[5] 2424 pand $twmask,$twtmp 2425 aesdec $rndkey0,$inout2 2426 aesdec $rndkey0,$inout3 2427 aesdec $rndkey0,$inout4 2428 pxor $twtmp,@tweak[5] 2429 movaps @tweak[3],@tweak[4] 2430 aesdec $rndkey0,$inout5 2431 2432 movdqa $twres,$rndkey0 2433 paddd $twres,$twres 2434 aesdec $rndkey1,$inout0 2435 pxor @tweak[5],@tweak[3] 2436 psrad \$31,$rndkey0 2437 aesdec $rndkey1,$inout1 2438 paddq @tweak[5],@tweak[5] 2439 pand $twmask,$rndkey0 2440 aesdec $rndkey1,$inout2 2441 aesdec $rndkey1,$inout3 2442 pxor $rndkey0,@tweak[5] 2443 $movkey ($key_),$rndkey0 2444 aesdec $rndkey1,$inout4 2445 aesdec $rndkey1,$inout5 2446 $movkey 16($key_),$rndkey1 2447 2448 pxor @tweak[5],@tweak[4] 2449 aesdeclast `16*0`(%rsp),$inout0 2450 psrad \$31,$twres 2451 paddq @tweak[5],@tweak[5] 2452 aesdeclast `16*1`(%rsp),$inout1 2453 aesdeclast `16*2`(%rsp),$inout2 2454 pand $twmask,$twres 2455 mov %r10,%rax # restore $rounds 2456 aesdeclast `16*3`(%rsp),$inout3 2457 aesdeclast `16*4`(%rsp),$inout4 2458 aesdeclast `16*5`(%rsp),$inout5 2459 pxor $twres,@tweak[5] 2460 2461 lea `16*6`($out),$out # $out+=6*16 2462 movups $inout0,`-16*6`($out) # store 6 output blocks 2463 movups $inout1,`-16*5`($out) 2464 movups $inout2,`-16*4`($out) 2465 movups $inout3,`-16*3`($out) 2466 movups $inout4,`-16*2`($out) 2467 movups $inout5,`-16*1`($out) 2468 sub \$16*6,$len 2469 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2470 2471 mov \$16+96,$rounds 2472 sub $rnds_,$rounds 2473 mov $key_,$key # restore $key 2474 shr \$4,$rounds # restore original value 2475 2476 .Lxts_dec_short: 2477 # at the point @tweak[0..5] are populated with tweak values 2478 mov $rounds,$rnds_ # backup $rounds 2479 pxor $rndkey0,@tweak[0] 2480 pxor $rndkey0,@tweak[1] 2481 add \$16*6,$len # restore real remaining $len 2482 jz .Lxts_dec_done # done if ($len==0) 2483 2484 pxor $rndkey0,@tweak[2] 2485 cmp \$0x20,$len 2486 jb .Lxts_dec_one # $len is 1*16 2487 pxor $rndkey0,@tweak[3] 2488 je .Lxts_dec_two # $len is 2*16 2489 2490 pxor $rndkey0,@tweak[4] 2491 cmp \$0x40,$len 2492 jb .Lxts_dec_three # $len is 3*16 2493 je .Lxts_dec_four # $len is 4*16 2494 2495 movdqu ($inp),$inout0 # $len is 5*16 2496 movdqu 16*1($inp),$inout1 2497 movdqu 16*2($inp),$inout2 2498 pxor @tweak[0],$inout0 2499 movdqu 16*3($inp),$inout3 2500 pxor @tweak[1],$inout1 2501 movdqu 16*4($inp),$inout4 2502 lea 16*5($inp),$inp # $inp+=5*16 2503 pxor @tweak[2],$inout2 2504 pxor @tweak[3],$inout3 2505 pxor @tweak[4],$inout4 2506 2507 call _aesni_decrypt6 2508 2509 xorps @tweak[0],$inout0 2510 xorps @tweak[1],$inout1 2511 xorps @tweak[2],$inout2 2512 movdqu $inout0,($out) # store 5 output blocks 2513 xorps @tweak[3],$inout3 2514 movdqu $inout1,16*1($out) 2515 xorps @tweak[4],$inout4 2516 movdqu $inout2,16*2($out) 2517 pxor $twtmp,$twtmp 2518 movdqu $inout3,16*3($out) 2519 pcmpgtd @tweak[5],$twtmp 2520 movdqu $inout4,16*4($out) 2521 lea 16*5($out),$out # $out+=5*16 2522 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2523 and \$15,$len_ 2524 jz .Lxts_dec_ret 2525 2526 movdqa @tweak[5],@tweak[0] 2527 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2528 pand $twmask,@tweak[1] # isolate carry and residue 2529 pxor @tweak[5],@tweak[1] 2530 jmp .Lxts_dec_done2 2531 2532 .align 16 2533 .Lxts_dec_one: 2534 movups ($inp),$inout0 2535 lea 16*1($inp),$inp # $inp+=1*16 2536 xorps @tweak[0],$inout0 2537 ___ 2538 &aesni_generate1("dec",$key,$rounds); 2539 $code.=<<___; 2540 xorps @tweak[0],$inout0 2541 movdqa @tweak[1],@tweak[0] 2542 movups $inout0,($out) # store one output block 2543 movdqa @tweak[2],@tweak[1] 2544 lea 16*1($out),$out # $out+=1*16 2545 jmp .Lxts_dec_done 2546 2547 .align 16 2548 .Lxts_dec_two: 2549 movups ($inp),$inout0 2550 movups 16($inp),$inout1 2551 lea 32($inp),$inp # $inp+=2*16 2552 xorps @tweak[0],$inout0 2553 xorps @tweak[1],$inout1 2554 2555 call _aesni_decrypt2 2556 2557 xorps @tweak[0],$inout0 2558 movdqa @tweak[2],@tweak[0] 2559 xorps @tweak[1],$inout1 2560 movdqa @tweak[3],@tweak[1] 2561 movups $inout0,($out) # store 2 output blocks 2562 movups $inout1,16*1($out) 2563 lea 16*2($out),$out # $out+=2*16 2564 jmp .Lxts_dec_done 2565 2566 .align 16 2567 .Lxts_dec_three: 2568 movups ($inp),$inout0 2569 movups 16*1($inp),$inout1 2570 movups 16*2($inp),$inout2 2571 lea 16*3($inp),$inp # $inp+=3*16 2572 xorps @tweak[0],$inout0 2573 xorps @tweak[1],$inout1 2574 xorps @tweak[2],$inout2 2575 2576 call _aesni_decrypt3 2577 2578 xorps @tweak[0],$inout0 2579 movdqa @tweak[3],@tweak[0] 2580 xorps @tweak[1],$inout1 2581 movdqa @tweak[4],@tweak[1] 2582 xorps @tweak[2],$inout2 2583 movups $inout0,($out) # store 3 output blocks 2584 movups $inout1,16*1($out) 2585 movups $inout2,16*2($out) 2586 lea 16*3($out),$out # $out+=3*16 2587 jmp .Lxts_dec_done 2588 2589 .align 16 2590 .Lxts_dec_four: 2591 movups ($inp),$inout0 2592 movups 16*1($inp),$inout1 2593 movups 16*2($inp),$inout2 2594 xorps @tweak[0],$inout0 2595 movups 16*3($inp),$inout3 2596 lea 16*4($inp),$inp # $inp+=4*16 2597 xorps @tweak[1],$inout1 2598 xorps @tweak[2],$inout2 2599 xorps @tweak[3],$inout3 2600 2601 call _aesni_decrypt4 2602 2603 pxor @tweak[0],$inout0 2604 movdqa @tweak[4],@tweak[0] 2605 pxor @tweak[1],$inout1 2606 movdqa @tweak[5],@tweak[1] 2607 pxor @tweak[2],$inout2 2608 movdqu $inout0,($out) # store 4 output blocks 2609 pxor @tweak[3],$inout3 2610 movdqu $inout1,16*1($out) 2611 movdqu $inout2,16*2($out) 2612 movdqu $inout3,16*3($out) 2613 lea 16*4($out),$out # $out+=4*16 2614 jmp .Lxts_dec_done 2615 2616 .align 16 2617 .Lxts_dec_done: 2618 and \$15,$len_ # see if $len%16 is 0 2619 jz .Lxts_dec_ret 2620 .Lxts_dec_done2: 2621 mov $len_,$len 2622 mov $key_,$key # restore $key 2623 mov $rnds_,$rounds # restore $rounds 2624 2625 movups ($inp),$inout0 2626 xorps @tweak[1],$inout0 2627 ___ 2628 &aesni_generate1("dec",$key,$rounds); 2629 $code.=<<___; 2630 xorps @tweak[1],$inout0 2631 movups $inout0,($out) 2632 2633 .Lxts_dec_steal: 2634 movzb 16($inp),%eax # borrow $rounds ... 2635 movzb ($out),%ecx # ... and $key 2636 lea 1($inp),$inp 2637 mov %al,($out) 2638 mov %cl,16($out) 2639 lea 1($out),$out 2640 sub \$1,$len 2641 jnz .Lxts_dec_steal 2642 2643 sub $len_,$out # rewind $out 2644 mov $key_,$key # restore $key 2645 mov $rnds_,$rounds # restore $rounds 2646 2647 movups ($out),$inout0 2648 xorps @tweak[0],$inout0 2649 ___ 2650 &aesni_generate1("dec",$key,$rounds); 2651 $code.=<<___; 2652 xorps @tweak[0],$inout0 2653 movups $inout0,($out) 2654 2655 .Lxts_dec_ret: 2656 xorps %xmm0,%xmm0 # clear register bank 2657 pxor %xmm1,%xmm1 2658 pxor %xmm2,%xmm2 2659 pxor %xmm3,%xmm3 2660 pxor %xmm4,%xmm4 2661 pxor %xmm5,%xmm5 2662 ___ 2663 $code.=<<___ if (!$win64); 2664 pxor %xmm6,%xmm6 2665 pxor %xmm7,%xmm7 2666 movaps %xmm0,0x00(%rsp) # clear stack 2667 pxor %xmm8,%xmm8 2668 movaps %xmm0,0x10(%rsp) 2669 pxor %xmm9,%xmm9 2670 movaps %xmm0,0x20(%rsp) 2671 pxor %xmm10,%xmm10 2672 movaps %xmm0,0x30(%rsp) 2673 pxor %xmm11,%xmm11 2674 movaps %xmm0,0x40(%rsp) 2675 pxor %xmm12,%xmm12 2676 movaps %xmm0,0x50(%rsp) 2677 pxor %xmm13,%xmm13 2678 movaps %xmm0,0x60(%rsp) 2679 pxor %xmm14,%xmm14 2680 pxor %xmm15,%xmm15 2681 ___ 2682 $code.=<<___ if ($win64); 2683 movaps -0xa0(%rbp),%xmm6 2684 movaps %xmm0,-0xa0(%rbp) # clear stack 2685 movaps -0x90(%rbp),%xmm7 2686 movaps %xmm0,-0x90(%rbp) 2687 movaps -0x80(%rbp),%xmm8 2688 movaps %xmm0,-0x80(%rbp) 2689 movaps -0x70(%rbp),%xmm9 2690 movaps %xmm0,-0x70(%rbp) 2691 movaps -0x60(%rbp),%xmm10 2692 movaps %xmm0,-0x60(%rbp) 2693 movaps -0x50(%rbp),%xmm11 2694 movaps %xmm0,-0x50(%rbp) 2695 movaps -0x40(%rbp),%xmm12 2696 movaps %xmm0,-0x40(%rbp) 2697 movaps -0x30(%rbp),%xmm13 2698 movaps %xmm0,-0x30(%rbp) 2699 movaps -0x20(%rbp),%xmm14 2700 movaps %xmm0,-0x20(%rbp) 2701 movaps -0x10(%rbp),%xmm15 2702 movaps %xmm0,-0x10(%rbp) 2703 movaps %xmm0,0x00(%rsp) 2704 movaps %xmm0,0x10(%rsp) 2705 movaps %xmm0,0x20(%rsp) 2706 movaps %xmm0,0x30(%rsp) 2707 movaps %xmm0,0x40(%rsp) 2708 movaps %xmm0,0x50(%rsp) 2709 movaps %xmm0,0x60(%rsp) 2710 ___ 2711 $code.=<<___; 2712 lea (%rbp),%rsp 2713 pop %rbp 2714 .Lxts_dec_epilogue: 2715 ret 2716 .size aesni_xts_decrypt,.-aesni_xts_decrypt 2717 ___ 2718 } }} 2719 2721 ######################################################################## 2722 # void $PREFIX_cbc_encrypt (const void *inp, void *out, 2723 # size_t length, const AES_KEY *key, 2724 # unsigned char *ivp,const int enc); 2725 { 2726 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 2727 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 2728 my $inp_=$key_; 2729 2730 $code.=<<___; 2731 .globl ${PREFIX}_cbc_encrypt 2732 .type ${PREFIX}_cbc_encrypt,\@function,6 2733 .align 16 2734 ${PREFIX}_cbc_encrypt: 2735 test $len,$len # check length 2736 jz .Lcbc_ret 2737 2738 mov 240($key),$rnds_ # key->rounds 2739 mov $key,$key_ # backup $key 2740 test %r9d,%r9d # 6th argument 2741 jz .Lcbc_decrypt 2742 #--------------------------- CBC ENCRYPT ------------------------------# 2743 movups ($ivp),$inout0 # load iv as initial state 2744 mov $rnds_,$rounds 2745 cmp \$16,$len 2746 jb .Lcbc_enc_tail 2747 sub \$16,$len 2748 jmp .Lcbc_enc_loop 2749 .align 16 2750 .Lcbc_enc_loop: 2751 movups ($inp),$inout1 # load input 2752 lea 16($inp),$inp 2753 #xorps $inout1,$inout0 2754 ___ 2755 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 2756 $code.=<<___; 2757 mov $rnds_,$rounds # restore $rounds 2758 mov $key_,$key # restore $key 2759 movups $inout0,0($out) # store output 2760 lea 16($out),$out 2761 sub \$16,$len 2762 jnc .Lcbc_enc_loop 2763 add \$16,$len 2764 jnz .Lcbc_enc_tail 2765 pxor $rndkey0,$rndkey0 # clear register bank 2766 pxor $rndkey1,$rndkey1 2767 movups $inout0,($ivp) 2768 pxor $inout0,$inout0 2769 pxor $inout1,$inout1 2770 jmp .Lcbc_ret 2771 2772 .Lcbc_enc_tail: 2773 mov $len,%rcx # zaps $key 2774 xchg $inp,$out # $inp is %rsi and $out is %rdi now 2775 .long 0x9066A4F3 # rep movsb 2776 mov \$16,%ecx # zero tail 2777 sub $len,%rcx 2778 xor %eax,%eax 2779 .long 0x9066AAF3 # rep stosb 2780 lea -16(%rdi),%rdi # rewind $out by 1 block 2781 mov $rnds_,$rounds # restore $rounds 2782 mov %rdi,%rsi # $inp and $out are the same 2783 mov $key_,$key # restore $key 2784 xor $len,$len # len=16 2785 jmp .Lcbc_enc_loop # one more spin 2786 #--------------------------- CBC DECRYPT ------------------------------# 2788 .align 16 2789 .Lcbc_decrypt: 2790 cmp \$16,$len 2791 jne .Lcbc_decrypt_bulk 2792 2793 # handle single block without allocating stack frame, 2794 # useful in ciphertext stealing mode 2795 movdqu ($inp),$inout0 # load input 2796 movdqu ($ivp),$inout1 # load iv 2797 movdqa $inout0,$inout2 # future iv 2798 ___ 2799 &aesni_generate1("dec",$key,$rnds_); 2800 $code.=<<___; 2801 pxor $rndkey0,$rndkey0 # clear register bank 2802 pxor $rndkey1,$rndkey1 2803 movdqu $inout2,($ivp) # store iv 2804 xorps $inout1,$inout0 # ^=iv 2805 pxor $inout1,$inout1 2806 movups $inout0,($out) # store output 2807 pxor $inout0,$inout0 2808 jmp .Lcbc_ret 2809 .align 16 2810 .Lcbc_decrypt_bulk: 2811 lea (%rsp),%rax 2812 push %rbp 2813 sub \$$frame_size,%rsp 2814 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2815 ___ 2816 $code.=<<___ if ($win64); 2817 movaps %xmm6,0x10(%rsp) 2818 movaps %xmm7,0x20(%rsp) 2819 movaps %xmm8,0x30(%rsp) 2820 movaps %xmm9,0x40(%rsp) 2821 movaps %xmm10,0x50(%rsp) 2822 movaps %xmm11,0x60(%rsp) 2823 movaps %xmm12,0x70(%rsp) 2824 movaps %xmm13,0x80(%rsp) 2825 movaps %xmm14,0x90(%rsp) 2826 movaps %xmm15,0xa0(%rsp) 2827 .Lcbc_decrypt_body: 2828 ___ 2829 $code.=<<___; 2830 lea -8(%rax),%rbp 2831 movups ($ivp),$iv 2832 mov $rnds_,$rounds 2833 cmp \$0x50,$len 2834 jbe .Lcbc_dec_tail 2835 2836 $movkey ($key),$rndkey0 2837 movdqu 0x00($inp),$inout0 # load input 2838 movdqu 0x10($inp),$inout1 2839 movdqa $inout0,$in0 2840 movdqu 0x20($inp),$inout2 2841 movdqa $inout1,$in1 2842 movdqu 0x30($inp),$inout3 2843 movdqa $inout2,$in2 2844 movdqu 0x40($inp),$inout4 2845 movdqa $inout3,$in3 2846 movdqu 0x50($inp),$inout5 2847 movdqa $inout4,$in4 2848 mov OPENSSL_ia32cap_P+4(%rip),%r9d 2849 cmp \$0x70,$len 2850 jbe .Lcbc_dec_six_or_seven 2851 2852 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 2853 sub \$0x50,$len # $len is biased by -5*16 2854 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 2855 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 2856 sub \$0x20,$len # $len is biased by -7*16 2857 lea 0x70($key),$key # size optimization 2858 jmp .Lcbc_dec_loop8_enter 2859 .align 16 2860 .Lcbc_dec_loop8: 2861 movups $inout7,($out) 2862 lea 0x10($out),$out 2863 .Lcbc_dec_loop8_enter: 2864 movdqu 0x60($inp),$inout6 2865 pxor $rndkey0,$inout0 2866 movdqu 0x70($inp),$inout7 2867 pxor $rndkey0,$inout1 2868 $movkey 0x10-0x70($key),$rndkey1 2869 pxor $rndkey0,$inout2 2870 xor $inp_,$inp_ 2871 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 2872 pxor $rndkey0,$inout3 2873 pxor $rndkey0,$inout4 2874 pxor $rndkey0,$inout5 2875 pxor $rndkey0,$inout6 2876 2877 aesdec $rndkey1,$inout0 2878 pxor $rndkey0,$inout7 2879 $movkey 0x20-0x70($key),$rndkey0 2880 aesdec $rndkey1,$inout1 2881 aesdec $rndkey1,$inout2 2882 aesdec $rndkey1,$inout3 2883 aesdec $rndkey1,$inout4 2884 aesdec $rndkey1,$inout5 2885 aesdec $rndkey1,$inout6 2886 setnc ${inp_}b 2887 shl \$7,$inp_ 2888 aesdec $rndkey1,$inout7 2889 add $inp,$inp_ 2890 $movkey 0x30-0x70($key),$rndkey1 2891 ___ 2892 for($i=1;$i<12;$i++) { 2893 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 2894 $code.=<<___ if ($i==7); 2895 cmp \$11,$rounds 2896 ___ 2897 $code.=<<___; 2898 aesdec $rndkeyx,$inout0 2899 aesdec $rndkeyx,$inout1 2900 aesdec $rndkeyx,$inout2 2901 aesdec $rndkeyx,$inout3 2902 aesdec $rndkeyx,$inout4 2903 aesdec $rndkeyx,$inout5 2904 aesdec $rndkeyx,$inout6 2905 aesdec $rndkeyx,$inout7 2906 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 2907 ___ 2908 $code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 2909 nop 2910 ___ 2911 $code.=<<___ if ($i==7); 2912 jb .Lcbc_dec_done 2913 ___ 2914 $code.=<<___ if ($i==9); 2915 je .Lcbc_dec_done 2916 ___ 2917 $code.=<<___ if ($i==11); 2918 jmp .Lcbc_dec_done 2919 ___ 2920 } 2921 $code.=<<___; 2922 .align 16 2923 .Lcbc_dec_done: 2924 aesdec $rndkey1,$inout0 2925 aesdec $rndkey1,$inout1 2926 pxor $rndkey0,$iv 2927 pxor $rndkey0,$in0 2928 aesdec $rndkey1,$inout2 2929 aesdec $rndkey1,$inout3 2930 pxor $rndkey0,$in1 2931 pxor $rndkey0,$in2 2932 aesdec $rndkey1,$inout4 2933 aesdec $rndkey1,$inout5 2934 pxor $rndkey0,$in3 2935 pxor $rndkey0,$in4 2936 aesdec $rndkey1,$inout6 2937 aesdec $rndkey1,$inout7 2938 movdqu 0x50($inp),$rndkey1 2939 2940 aesdeclast $iv,$inout0 2941 movdqu 0x60($inp),$iv # borrow $iv 2942 pxor $rndkey0,$rndkey1 2943 aesdeclast $in0,$inout1 2944 pxor $rndkey0,$iv 2945 movdqu 0x70($inp),$rndkey0 # next IV 2946 aesdeclast $in1,$inout2 2947 lea 0x80($inp),$inp 2948 movdqu 0x00($inp_),$in0 2949 aesdeclast $in2,$inout3 2950 aesdeclast $in3,$inout4 2951 movdqu 0x10($inp_),$in1 2952 movdqu 0x20($inp_),$in2 2953 aesdeclast $in4,$inout5 2954 aesdeclast $rndkey1,$inout6 2955 movdqu 0x30($inp_),$in3 2956 movdqu 0x40($inp_),$in4 2957 aesdeclast $iv,$inout7 2958 movdqa $rndkey0,$iv # return $iv 2959 movdqu 0x50($inp_),$rndkey1 2960 $movkey -0x70($key),$rndkey0 2961 2962 movups $inout0,($out) # store output 2963 movdqa $in0,$inout0 2964 movups $inout1,0x10($out) 2965 movdqa $in1,$inout1 2966 movups $inout2,0x20($out) 2967 movdqa $in2,$inout2 2968 movups $inout3,0x30($out) 2969 movdqa $in3,$inout3 2970 movups $inout4,0x40($out) 2971 movdqa $in4,$inout4 2972 movups $inout5,0x50($out) 2973 movdqa $rndkey1,$inout5 2974 movups $inout6,0x60($out) 2975 lea 0x70($out),$out 2976 2977 sub \$0x80,$len 2978 ja .Lcbc_dec_loop8 2979 2980 movaps $inout7,$inout0 2981 lea -0x70($key),$key 2982 add \$0x70,$len 2983 jle .Lcbc_dec_clear_tail_collected 2984 movups $inout7,($out) 2985 lea 0x10($out),$out 2986 cmp \$0x50,$len 2987 jbe .Lcbc_dec_tail 2988 2989 movaps $in0,$inout0 2990 .Lcbc_dec_six_or_seven: 2991 cmp \$0x60,$len 2992 ja .Lcbc_dec_seven 2993 2994 movaps $inout5,$inout6 2995 call _aesni_decrypt6 2996 pxor $iv,$inout0 # ^= IV 2997 movaps $inout6,$iv 2998 pxor $in0,$inout1 2999 movdqu $inout0,($out) 3000 pxor $in1,$inout2 3001 movdqu $inout1,0x10($out) 3002 pxor $inout1,$inout1 # clear register bank 3003 pxor $in2,$inout3 3004 movdqu $inout2,0x20($out) 3005 pxor $inout2,$inout2 3006 pxor $in3,$inout4 3007 movdqu $inout3,0x30($out) 3008 pxor $inout3,$inout3 3009 pxor $in4,$inout5 3010 movdqu $inout4,0x40($out) 3011 pxor $inout4,$inout4 3012 lea 0x50($out),$out 3013 movdqa $inout5,$inout0 3014 pxor $inout5,$inout5 3015 jmp .Lcbc_dec_tail_collected 3016 3017 .align 16 3018 .Lcbc_dec_seven: 3019 movups 0x60($inp),$inout6 3020 xorps $inout7,$inout7 3021 call _aesni_decrypt8 3022 movups 0x50($inp),$inout7 3023 pxor $iv,$inout0 # ^= IV 3024 movups 0x60($inp),$iv 3025 pxor $in0,$inout1 3026 movdqu $inout0,($out) 3027 pxor $in1,$inout2 3028 movdqu $inout1,0x10($out) 3029 pxor $inout1,$inout1 # clear register bank 3030 pxor $in2,$inout3 3031 movdqu $inout2,0x20($out) 3032 pxor $inout2,$inout2 3033 pxor $in3,$inout4 3034 movdqu $inout3,0x30($out) 3035 pxor $inout3,$inout3 3036 pxor $in4,$inout5 3037 movdqu $inout4,0x40($out) 3038 pxor $inout4,$inout4 3039 pxor $inout7,$inout6 3040 movdqu $inout5,0x50($out) 3041 pxor $inout5,$inout5 3042 lea 0x60($out),$out 3043 movdqa $inout6,$inout0 3044 pxor $inout6,$inout6 3045 pxor $inout7,$inout7 3046 jmp .Lcbc_dec_tail_collected 3047 3048 .align 16 3049 .Lcbc_dec_loop6: 3050 movups $inout5,($out) 3051 lea 0x10($out),$out 3052 movdqu 0x00($inp),$inout0 # load input 3053 movdqu 0x10($inp),$inout1 3054 movdqa $inout0,$in0 3055 movdqu 0x20($inp),$inout2 3056 movdqa $inout1,$in1 3057 movdqu 0x30($inp),$inout3 3058 movdqa $inout2,$in2 3059 movdqu 0x40($inp),$inout4 3060 movdqa $inout3,$in3 3061 movdqu 0x50($inp),$inout5 3062 movdqa $inout4,$in4 3063 .Lcbc_dec_loop6_enter: 3064 lea 0x60($inp),$inp 3065 movdqa $inout5,$inout6 3066 3067 call _aesni_decrypt6 3068 3069 pxor $iv,$inout0 # ^= IV 3070 movdqa $inout6,$iv 3071 pxor $in0,$inout1 3072 movdqu $inout0,($out) 3073 pxor $in1,$inout2 3074 movdqu $inout1,0x10($out) 3075 pxor $in2,$inout3 3076 movdqu $inout2,0x20($out) 3077 pxor $in3,$inout4 3078 mov $key_,$key 3079 movdqu $inout3,0x30($out) 3080 pxor $in4,$inout5 3081 mov $rnds_,$rounds 3082 movdqu $inout4,0x40($out) 3083 lea 0x50($out),$out 3084 sub \$0x60,$len 3085 ja .Lcbc_dec_loop6 3086 3087 movdqa $inout5,$inout0 3088 add \$0x50,$len 3089 jle .Lcbc_dec_clear_tail_collected 3090 movups $inout5,($out) 3091 lea 0x10($out),$out 3092 3093 .Lcbc_dec_tail: 3094 movups ($inp),$inout0 3095 sub \$0x10,$len 3096 jbe .Lcbc_dec_one # $len is 1*16 or less 3097 3098 movups 0x10($inp),$inout1 3099 movaps $inout0,$in0 3100 sub \$0x10,$len 3101 jbe .Lcbc_dec_two # $len is 2*16 or less 3102 3103 movups 0x20($inp),$inout2 3104 movaps $inout1,$in1 3105 sub \$0x10,$len 3106 jbe .Lcbc_dec_three # $len is 3*16 or less 3107 3108 movups 0x30($inp),$inout3 3109 movaps $inout2,$in2 3110 sub \$0x10,$len 3111 jbe .Lcbc_dec_four # $len is 4*16 or less 3112 3113 movups 0x40($inp),$inout4 # $len is 5*16 or less 3114 movaps $inout3,$in3 3115 movaps $inout4,$in4 3116 xorps $inout5,$inout5 3117 call _aesni_decrypt6 3118 pxor $iv,$inout0 3119 movaps $in4,$iv 3120 pxor $in0,$inout1 3121 movdqu $inout0,($out) 3122 pxor $in1,$inout2 3123 movdqu $inout1,0x10($out) 3124 pxor $inout1,$inout1 # clear register bank 3125 pxor $in2,$inout3 3126 movdqu $inout2,0x20($out) 3127 pxor $inout2,$inout2 3128 pxor $in3,$inout4 3129 movdqu $inout3,0x30($out) 3130 pxor $inout3,$inout3 3131 lea 0x40($out),$out 3132 movdqa $inout4,$inout0 3133 pxor $inout4,$inout4 3134 pxor $inout5,$inout5 3135 sub \$0x10,$len 3136 jmp .Lcbc_dec_tail_collected 3137 3138 .align 16 3139 .Lcbc_dec_one: 3140 movaps $inout0,$in0 3141 ___ 3142 &aesni_generate1("dec",$key,$rounds); 3143 $code.=<<___; 3144 xorps $iv,$inout0 3145 movaps $in0,$iv 3146 jmp .Lcbc_dec_tail_collected 3147 .align 16 3148 .Lcbc_dec_two: 3149 movaps $inout1,$in1 3150 call _aesni_decrypt2 3151 pxor $iv,$inout0 3152 movaps $in1,$iv 3153 pxor $in0,$inout1 3154 movdqu $inout0,($out) 3155 movdqa $inout1,$inout0 3156 pxor $inout1,$inout1 # clear register bank 3157 lea 0x10($out),$out 3158 jmp .Lcbc_dec_tail_collected 3159 .align 16 3160 .Lcbc_dec_three: 3161 movaps $inout2,$in2 3162 call _aesni_decrypt3 3163 pxor $iv,$inout0 3164 movaps $in2,$iv 3165 pxor $in0,$inout1 3166 movdqu $inout0,($out) 3167 pxor $in1,$inout2 3168 movdqu $inout1,0x10($out) 3169 pxor $inout1,$inout1 # clear register bank 3170 movdqa $inout2,$inout0 3171 pxor $inout2,$inout2 3172 lea 0x20($out),$out 3173 jmp .Lcbc_dec_tail_collected 3174 .align 16 3175 .Lcbc_dec_four: 3176 movaps $inout3,$in3 3177 call _aesni_decrypt4 3178 pxor $iv,$inout0 3179 movaps $in3,$iv 3180 pxor $in0,$inout1 3181 movdqu $inout0,($out) 3182 pxor $in1,$inout2 3183 movdqu $inout1,0x10($out) 3184 pxor $inout1,$inout1 # clear register bank 3185 pxor $in2,$inout3 3186 movdqu $inout2,0x20($out) 3187 pxor $inout2,$inout2 3188 movdqa $inout3,$inout0 3189 pxor $inout3,$inout3 3190 lea 0x30($out),$out 3191 jmp .Lcbc_dec_tail_collected 3192 3193 .align 16 3194 .Lcbc_dec_clear_tail_collected: 3195 pxor $inout1,$inout1 # clear register bank 3196 pxor $inout2,$inout2 3197 pxor $inout3,$inout3 3198 ___ 3199 $code.=<<___ if (!$win64); 3200 pxor $inout4,$inout4 # %xmm6..9 3201 pxor $inout5,$inout5 3202 pxor $inout6,$inout6 3203 pxor $inout7,$inout7 3204 ___ 3205 $code.=<<___; 3206 .Lcbc_dec_tail_collected: 3207 movups $iv,($ivp) 3208 and \$15,$len 3209 jnz .Lcbc_dec_tail_partial 3210 movups $inout0,($out) 3211 pxor $inout0,$inout0 3212 jmp .Lcbc_dec_ret 3213 .align 16 3214 .Lcbc_dec_tail_partial: 3215 movaps $inout0,(%rsp) 3216 pxor $inout0,$inout0 3217 mov \$16,%rcx 3218 mov $out,%rdi 3219 sub $len,%rcx 3220 lea (%rsp),%rsi 3221 .long 0x9066A4F3 # rep movsb 3222 movdqa $inout0,(%rsp) 3223 3224 .Lcbc_dec_ret: 3225 xorps $rndkey0,$rndkey0 # %xmm0 3226 pxor $rndkey1,$rndkey1 3227 ___ 3228 $code.=<<___ if ($win64); 3229 movaps 0x10(%rsp),%xmm6 3230 movaps %xmm0,0x10(%rsp) # clear stack 3231 movaps 0x20(%rsp),%xmm7 3232 movaps %xmm0,0x20(%rsp) 3233 movaps 0x30(%rsp),%xmm8 3234 movaps %xmm0,0x30(%rsp) 3235 movaps 0x40(%rsp),%xmm9 3236 movaps %xmm0,0x40(%rsp) 3237 movaps 0x50(%rsp),%xmm10 3238 movaps %xmm0,0x50(%rsp) 3239 movaps 0x60(%rsp),%xmm11 3240 movaps %xmm0,0x60(%rsp) 3241 movaps 0x70(%rsp),%xmm12 3242 movaps %xmm0,0x70(%rsp) 3243 movaps 0x80(%rsp),%xmm13 3244 movaps %xmm0,0x80(%rsp) 3245 movaps 0x90(%rsp),%xmm14 3246 movaps %xmm0,0x90(%rsp) 3247 movaps 0xa0(%rsp),%xmm15 3248 movaps %xmm0,0xa0(%rsp) 3249 ___ 3250 $code.=<<___; 3251 lea (%rbp),%rsp 3252 pop %rbp 3253 .Lcbc_ret: 3254 ret 3255 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 3256 ___ 3257 } 3259 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 3260 # int bits, AES_KEY *key) 3261 # 3262 # input: $inp user-supplied key 3263 # $bits $inp length in bits 3264 # $key pointer to key schedule 3265 # output: %eax 0 denoting success, -1 or -2 - failure (see C) 3266 # *$key key schedule 3267 # 3268 { my ($inp,$bits,$key) = @_4args; 3269 $bits =~ s/%r/%e/; 3270 3271 $code.=<<___; 3272 .globl ${PREFIX}_set_decrypt_key 3273 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 3274 .align 16 3275 ${PREFIX}_set_decrypt_key: 3276 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 3277 call __aesni_set_encrypt_key 3278 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 3279 test %eax,%eax 3280 jnz .Ldec_key_ret 3281 lea 16($key,$bits),$inp # points at the end of key schedule 3282 3283 $movkey ($key),%xmm0 # just swap 3284 $movkey ($inp),%xmm1 3285 $movkey %xmm0,($inp) 3286 $movkey %xmm1,($key) 3287 lea 16($key),$key 3288 lea -16($inp),$inp 3289 3290 .Ldec_key_inverse: 3291 $movkey ($key),%xmm0 # swap and inverse 3292 $movkey ($inp),%xmm1 3293 aesimc %xmm0,%xmm0 3294 aesimc %xmm1,%xmm1 3295 lea 16($key),$key 3296 lea -16($inp),$inp 3297 $movkey %xmm0,16($inp) 3298 $movkey %xmm1,-16($key) 3299 cmp $key,$inp 3300 ja .Ldec_key_inverse 3301 3302 $movkey ($key),%xmm0 # inverse middle 3303 aesimc %xmm0,%xmm0 3304 pxor %xmm1,%xmm1 3305 $movkey %xmm0,($inp) 3306 pxor %xmm0,%xmm0 3307 .Ldec_key_ret: 3308 add \$8,%rsp 3309 ret 3310 .LSEH_end_set_decrypt_key: 3311 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 3312 ___ 3313 3315 # This is based on submission by 3316 # 3317 # Huang Ying <ying.huang (at] intel.com> 3318 # Vinodh Gopal <vinodh.gopal (at] intel.com> 3319 # Kahraman Akdemir 3320 # 3321 # Agressively optimized in respect to aeskeygenassist's critical path 3322 # and is contained in %xmm0-5 to meet Win64 ABI requirement. 3323 # 3324 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 3325 # int bits, AES_KEY * const key); 3326 # 3327 # input: $inp user-supplied key 3328 # $bits $inp length in bits 3329 # $key pointer to key schedule 3330 # output: %eax 0 denoting success, -1 or -2 - failure (see C) 3331 # $bits rounds-1 (used in aesni_set_decrypt_key) 3332 # *$key key schedule 3333 # $key pointer to key schedule (used in 3334 # aesni_set_decrypt_key) 3335 # 3336 # Subroutine is frame-less, which means that only volatile registers 3337 # are used. Note that it's declared "abi-omnipotent", which means that 3338 # amount of volatile registers is smaller on Windows. 3339 # 3340 $code.=<<___; 3341 .globl ${PREFIX}_set_encrypt_key 3342 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 3343 .align 16 3344 ${PREFIX}_set_encrypt_key: 3345 __aesni_set_encrypt_key: 3346 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 3347 mov \$-1,%rax 3348 test $inp,$inp 3349 jz .Lenc_key_ret 3350 test $key,$key 3351 jz .Lenc_key_ret 3352 3353 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits 3354 movups ($inp),%xmm0 # pull first 128 bits of *userKey 3355 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 3356 and OPENSSL_ia32cap_P+4(%rip),%r10d 3357 lea 16($key),%rax # %rax is used as modifiable copy of $key 3358 cmp \$256,$bits 3359 je .L14rounds 3360 cmp \$192,$bits 3361 je .L12rounds 3362 cmp \$128,$bits 3363 jne .Lbad_keybits 3364 3365 .L10rounds: 3366 mov \$9,$bits # 10 rounds for 128-bit key 3367 cmp \$`1<<28`,%r10d # AVX, bit no XOP 3368 je .L10rounds_alt 3369 3370 $movkey %xmm0,($key) # round 0 3371 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 3372 call .Lkey_expansion_128_cold 3373 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 3374 call .Lkey_expansion_128 3375 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 3376 call .Lkey_expansion_128 3377 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 3378 call .Lkey_expansion_128 3379 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 3380 call .Lkey_expansion_128 3381 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 3382 call .Lkey_expansion_128 3383 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 3384 call .Lkey_expansion_128 3385 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 3386 call .Lkey_expansion_128 3387 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 3388 call .Lkey_expansion_128 3389 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 3390 call .Lkey_expansion_128 3391 $movkey %xmm0,(%rax) 3392 mov $bits,80(%rax) # 240(%rdx) 3393 xor %eax,%eax 3394 jmp .Lenc_key_ret 3395 3396 .align 16 3397 .L10rounds_alt: 3398 movdqa .Lkey_rotate(%rip),%xmm5 3399 mov \$8,%r10d 3400 movdqa .Lkey_rcon1(%rip),%xmm4 3401 movdqa %xmm0,%xmm2 3402 movdqu %xmm0,($key) 3403 jmp .Loop_key128 3404 3405 .align 16 3406 .Loop_key128: 3407 pshufb %xmm5,%xmm0 3408 aesenclast %xmm4,%xmm0 3409 pslld \$1,%xmm4 3410 lea 16(%rax),%rax 3411 3412 movdqa %xmm2,%xmm3 3413 pslldq \$4,%xmm2 3414 pxor %xmm2,%xmm3 3415 pslldq \$4,%xmm2 3416 pxor %xmm2,%xmm3 3417 pslldq \$4,%xmm2 3418 pxor %xmm3,%xmm2 3419 3420 pxor %xmm2,%xmm0 3421 movdqu %xmm0,-16(%rax) 3422 movdqa %xmm0,%xmm2 3423 3424 dec %r10d 3425 jnz .Loop_key128 3426 3427 movdqa .Lkey_rcon1b(%rip),%xmm4 3428 3429 pshufb %xmm5,%xmm0 3430 aesenclast %xmm4,%xmm0 3431 pslld \$1,%xmm4 3432 3433 movdqa %xmm2,%xmm3 3434 pslldq \$4,%xmm2 3435 pxor %xmm2,%xmm3 3436 pslldq \$4,%xmm2 3437 pxor %xmm2,%xmm3 3438 pslldq \$4,%xmm2 3439 pxor %xmm3,%xmm2 3440 3441 pxor %xmm2,%xmm0 3442 movdqu %xmm0,(%rax) 3443 3444 movdqa %xmm0,%xmm2 3445 pshufb %xmm5,%xmm0 3446 aesenclast %xmm4,%xmm0 3447 3448 movdqa %xmm2,%xmm3 3449 pslldq \$4,%xmm2 3450 pxor %xmm2,%xmm3 3451 pslldq \$4,%xmm2 3452 pxor %xmm2,%xmm3 3453 pslldq \$4,%xmm2 3454 pxor %xmm3,%xmm2 3455 3456 pxor %xmm2,%xmm0 3457 movdqu %xmm0,16(%rax) 3458 3459 mov $bits,96(%rax) # 240($key) 3460 xor %eax,%eax 3461 jmp .Lenc_key_ret 3462 3463 .align 16 3464 .L12rounds: 3465 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 3466 mov \$11,$bits # 12 rounds for 192 3467 cmp \$`1<<28`,%r10d # AVX, but no XOP 3468 je .L12rounds_alt 3469 3470 $movkey %xmm0,($key) # round 0 3471 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 3472 call .Lkey_expansion_192a_cold 3473 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 3474 call .Lkey_expansion_192b 3475 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 3476 call .Lkey_expansion_192a 3477 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 3478 call .Lkey_expansion_192b 3479 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 3480 call .Lkey_expansion_192a 3481 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 3482 call .Lkey_expansion_192b 3483 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 3484 call .Lkey_expansion_192a 3485 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 3486 call .Lkey_expansion_192b 3487 $movkey %xmm0,(%rax) 3488 mov $bits,48(%rax) # 240(%rdx) 3489 xor %rax, %rax 3490 jmp .Lenc_key_ret 3491 3492 .align 16 3493 .L12rounds_alt: 3494 movdqa .Lkey_rotate192(%rip),%xmm5 3495 movdqa .Lkey_rcon1(%rip),%xmm4 3496 mov \$8,%r10d 3497 movdqu %xmm0,($key) 3498 jmp .Loop_key192 3499 3500 .align 16 3501 .Loop_key192: 3502 movq %xmm2,0(%rax) 3503 movdqa %xmm2,%xmm1 3504 pshufb %xmm5,%xmm2 3505 aesenclast %xmm4,%xmm2 3506 pslld \$1, %xmm4 3507 lea 24(%rax),%rax 3508 3509 movdqa %xmm0,%xmm3 3510 pslldq \$4,%xmm0 3511 pxor %xmm0,%xmm3 3512 pslldq \$4,%xmm0 3513 pxor %xmm0,%xmm3 3514 pslldq \$4,%xmm0 3515 pxor %xmm3,%xmm0 3516 3517 pshufd \$0xff,%xmm0,%xmm3 3518 pxor %xmm1,%xmm3 3519 pslldq \$4,%xmm1 3520 pxor %xmm1,%xmm3 3521 3522 pxor %xmm2,%xmm0 3523 pxor %xmm3,%xmm2 3524 movdqu %xmm0,-16(%rax) 3525 3526 dec %r10d 3527 jnz .Loop_key192 3528 3529 mov $bits,32(%rax) # 240($key) 3530 xor %eax,%eax 3531 jmp .Lenc_key_ret 3532 3533 .align 16 3534 .L14rounds: 3535 movups 16($inp),%xmm2 # remaning half of *userKey 3536 mov \$13,$bits # 14 rounds for 256 3537 lea 16(%rax),%rax 3538 cmp \$`1<<28`,%r10d # AVX, but no XOP 3539 je .L14rounds_alt 3540 3541 $movkey %xmm0,($key) # round 0 3542 $movkey %xmm2,16($key) # round 1 3543 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 3544 call .Lkey_expansion_256a_cold 3545 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 3546 call .Lkey_expansion_256b 3547 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 3548 call .Lkey_expansion_256a 3549 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 3550 call .Lkey_expansion_256b 3551 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 3552 call .Lkey_expansion_256a 3553 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 3554 call .Lkey_expansion_256b 3555 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 3556 call .Lkey_expansion_256a 3557 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 3558 call .Lkey_expansion_256b 3559 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 3560 call .Lkey_expansion_256a 3561 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 3562 call .Lkey_expansion_256b 3563 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 3564 call .Lkey_expansion_256a 3565 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 3566 call .Lkey_expansion_256b 3567 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 3568 call .Lkey_expansion_256a 3569 $movkey %xmm0,(%rax) 3570 mov $bits,16(%rax) # 240(%rdx) 3571 xor %rax,%rax 3572 jmp .Lenc_key_ret 3573 3574 .align 16 3575 .L14rounds_alt: 3576 movdqa .Lkey_rotate(%rip),%xmm5 3577 movdqa .Lkey_rcon1(%rip),%xmm4 3578 mov \$7,%r10d 3579 movdqu %xmm0,0($key) 3580 movdqa %xmm2,%xmm1 3581 movdqu %xmm2,16($key) 3582 jmp .Loop_key256 3583 3584 .align 16 3585 .Loop_key256: 3586 pshufb %xmm5,%xmm2 3587 aesenclast %xmm4,%xmm2 3588 3589 movdqa %xmm0,%xmm3 3590 pslldq \$4,%xmm0 3591 pxor %xmm0,%xmm3 3592 pslldq \$4,%xmm0 3593 pxor %xmm0,%xmm3 3594 pslldq \$4,%xmm0 3595 pxor %xmm3,%xmm0 3596 pslld \$1,%xmm4 3597 3598 pxor %xmm2,%xmm0 3599 movdqu %xmm0,(%rax) 3600 3601 dec %r10d 3602 jz .Ldone_key256 3603 3604 pshufd \$0xff,%xmm0,%xmm2 3605 pxor %xmm3,%xmm3 3606 aesenclast %xmm3,%xmm2 3607 3608 movdqa %xmm1,%xmm3 3609 pslldq \$4,%xmm1 3610 pxor %xmm1,%xmm3 3611 pslldq \$4,%xmm1 3612 pxor %xmm1,%xmm3 3613 pslldq \$4,%xmm1 3614 pxor %xmm3,%xmm1 3615 3616 pxor %xmm1,%xmm2 3617 movdqu %xmm2,16(%rax) 3618 lea 32(%rax),%rax 3619 movdqa %xmm2,%xmm1 3620 3621 jmp .Loop_key256 3622 3623 .Ldone_key256: 3624 mov $bits,16(%rax) # 240($key) 3625 xor %eax,%eax 3626 jmp .Lenc_key_ret 3627 3628 .align 16 3629 .Lbad_keybits: 3630 mov \$-2,%rax 3631 .Lenc_key_ret: 3632 pxor %xmm0,%xmm0 3633 pxor %xmm1,%xmm1 3634 pxor %xmm2,%xmm2 3635 pxor %xmm3,%xmm3 3636 pxor %xmm4,%xmm4 3637 pxor %xmm5,%xmm5 3638 add \$8,%rsp 3639 ret 3640 .LSEH_end_set_encrypt_key: 3641 3643 .align 16 3644 .Lkey_expansion_128: 3645 $movkey %xmm0,(%rax) 3646 lea 16(%rax),%rax 3647 .Lkey_expansion_128_cold: 3648 shufps \$0b00010000,%xmm0,%xmm4 3649 xorps %xmm4, %xmm0 3650 shufps \$0b10001100,%xmm0,%xmm4 3651 xorps %xmm4, %xmm0 3652 shufps \$0b11111111,%xmm1,%xmm1 # critical path 3653 xorps %xmm1,%xmm0 3654 ret 3655 3656 .align 16 3657 .Lkey_expansion_192a: 3658 $movkey %xmm0,(%rax) 3659 lea 16(%rax),%rax 3660 .Lkey_expansion_192a_cold: 3661 movaps %xmm2, %xmm5 3662 .Lkey_expansion_192b_warm: 3663 shufps \$0b00010000,%xmm0,%xmm4 3664 movdqa %xmm2,%xmm3 3665 xorps %xmm4,%xmm0 3666 shufps \$0b10001100,%xmm0,%xmm4 3667 pslldq \$4,%xmm3 3668 xorps %xmm4,%xmm0 3669 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 3670 pxor %xmm3,%xmm2 3671 pxor %xmm1,%xmm0 3672 pshufd \$0b11111111,%xmm0,%xmm3 3673 pxor %xmm3,%xmm2 3674 ret 3675 3676 .align 16 3677 .Lkey_expansion_192b: 3678 movaps %xmm0,%xmm3 3679 shufps \$0b01000100,%xmm0,%xmm5 3680 $movkey %xmm5,(%rax) 3681 shufps \$0b01001110,%xmm2,%xmm3 3682 $movkey %xmm3,16(%rax) 3683 lea 32(%rax),%rax 3684 jmp .Lkey_expansion_192b_warm 3685 3686 .align 16 3687 .Lkey_expansion_256a: 3688 $movkey %xmm2,(%rax) 3689 lea 16(%rax),%rax 3690 .Lkey_expansion_256a_cold: 3691 shufps \$0b00010000,%xmm0,%xmm4 3692 xorps %xmm4,%xmm0 3693 shufps \$0b10001100,%xmm0,%xmm4 3694 xorps %xmm4,%xmm0 3695 shufps \$0b11111111,%xmm1,%xmm1 # critical path 3696 xorps %xmm1,%xmm0 3697 ret 3698 3699 .align 16 3700 .Lkey_expansion_256b: 3701 $movkey %xmm0,(%rax) 3702 lea 16(%rax),%rax 3703 3704 shufps \$0b00010000,%xmm2,%xmm4 3705 xorps %xmm4,%xmm2 3706 shufps \$0b10001100,%xmm2,%xmm4 3707 xorps %xmm4,%xmm2 3708 shufps \$0b10101010,%xmm1,%xmm1 # critical path 3709 xorps %xmm1,%xmm2 3710 ret 3711 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 3712 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 3713 ___ 3714 } 3715 3717 $code.=<<___; 3718 .align 64 3719 .Lbswap_mask: 3720 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 3721 .Lincrement32: 3722 .long 6,6,6,0 3723 .Lincrement64: 3724 .long 1,0,0,0 3725 .Lxts_magic: 3726 .long 0x87,0,1,0 3727 .Lincrement1: 3728 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 3729 .Lkey_rotate: 3730 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 3731 .Lkey_rotate192: 3732 .long 0x04070605,0x04070605,0x04070605,0x04070605 3733 .Lkey_rcon1: 3734 .long 1,1,1,1 3735 .Lkey_rcon1b: 3736 .long 0x1b,0x1b,0x1b,0x1b 3737 3738 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 3739 .align 64 3740 ___ 3741 3742 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3743 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 3744 if ($win64) { 3745 $rec="%rcx"; 3746 $frame="%rdx"; 3747 $context="%r8"; 3748 $disp="%r9"; 3749 3750 $code.=<<___; 3751 .extern __imp_RtlVirtualUnwind 3752 ___ 3753 $code.=<<___ if ($PREFIX eq "aesni"); 3754 .type ecb_ccm64_se_handler,\@abi-omnipotent 3755 .align 16 3756 ecb_ccm64_se_handler: 3757 push %rsi 3758 push %rdi 3759 push %rbx 3760 push %rbp 3761 push %r12 3762 push %r13 3763 push %r14 3764 push %r15 3765 pushfq 3766 sub \$64,%rsp 3767 3768 mov 120($context),%rax # pull context->Rax 3769 mov 248($context),%rbx # pull context->Rip 3770 3771 mov 8($disp),%rsi # disp->ImageBase 3772 mov 56($disp),%r11 # disp->HandlerData 3773 3774 mov 0(%r11),%r10d # HandlerData[0] 3775 lea (%rsi,%r10),%r10 # prologue label 3776 cmp %r10,%rbx # context->Rip<prologue label 3777 jb .Lcommon_seh_tail 3778 3779 mov 152($context),%rax # pull context->Rsp 3780 3781 mov 4(%r11),%r10d # HandlerData[1] 3782 lea (%rsi,%r10),%r10 # epilogue label 3783 cmp %r10,%rbx # context->Rip>=epilogue label 3784 jae .Lcommon_seh_tail 3785 3786 lea 0(%rax),%rsi # %xmm save area 3787 lea 512($context),%rdi # &context.Xmm6 3788 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 3789 .long 0xa548f3fc # cld; rep movsq 3790 lea 0x58(%rax),%rax # adjust stack pointer 3791 3792 jmp .Lcommon_seh_tail 3793 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 3794 3795 .type ctr_xts_se_handler,\@abi-omnipotent 3796 .align 16 3797 ctr_xts_se_handler: 3798 push %rsi 3799 push %rdi 3800 push %rbx 3801 push %rbp 3802 push %r12 3803 push %r13 3804 push %r14 3805 push %r15 3806 pushfq 3807 sub \$64,%rsp 3808 3809 mov 120($context),%rax # pull context->Rax 3810 mov 248($context),%rbx # pull context->Rip 3811 3812 mov 8($disp),%rsi # disp->ImageBase 3813 mov 56($disp),%r11 # disp->HandlerData 3814 3815 mov 0(%r11),%r10d # HandlerData[0] 3816 lea (%rsi,%r10),%r10 # prologue lable 3817 cmp %r10,%rbx # context->Rip<prologue label 3818 jb .Lcommon_seh_tail 3819 3820 mov 152($context),%rax # pull context->Rsp 3821 3822 mov 4(%r11),%r10d # HandlerData[1] 3823 lea (%rsi,%r10),%r10 # epilogue label 3824 cmp %r10,%rbx # context->Rip>=epilogue label 3825 jae .Lcommon_seh_tail 3826 3827 mov 160($context),%rax # pull context->Rbp 3828 lea -0xa0(%rax),%rsi # %xmm save area 3829 lea 512($context),%rdi # & context.Xmm6 3830 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3831 .long 0xa548f3fc # cld; rep movsq 3832 3833 jmp .Lcommon_rbp_tail 3834 .size ctr_xts_se_handler,.-ctr_xts_se_handler 3835 ___ 3836 $code.=<<___; 3837 .type cbc_se_handler,\@abi-omnipotent 3838 .align 16 3839 cbc_se_handler: 3840 push %rsi 3841 push %rdi 3842 push %rbx 3843 push %rbp 3844 push %r12 3845 push %r13 3846 push %r14 3847 push %r15 3848 pushfq 3849 sub \$64,%rsp 3850 3851 mov 152($context),%rax # pull context->Rsp 3852 mov 248($context),%rbx # pull context->Rip 3853 3854 lea .Lcbc_decrypt_bulk(%rip),%r10 3855 cmp %r10,%rbx # context->Rip<"prologue" label 3856 jb .Lcommon_seh_tail 3857 3858 lea .Lcbc_decrypt_body(%rip),%r10 3859 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 3860 jb .Lrestore_cbc_rax 3861 3862 lea .Lcbc_ret(%rip),%r10 3863 cmp %r10,%rbx # context->Rip>="epilogue" label 3864 jae .Lcommon_seh_tail 3865 3866 lea 16(%rax),%rsi # %xmm save area 3867 lea 512($context),%rdi # &context.Xmm6 3868 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3869 .long 0xa548f3fc # cld; rep movsq 3870 3871 .Lcommon_rbp_tail: 3872 mov 160($context),%rax # pull context->Rbp 3873 mov (%rax),%rbp # restore saved %rbp 3874 lea 8(%rax),%rax # adjust stack pointer 3875 mov %rbp,160($context) # restore context->Rbp 3876 jmp .Lcommon_seh_tail 3877 3878 .Lrestore_cbc_rax: 3879 mov 120($context),%rax 3880 3881 .Lcommon_seh_tail: 3882 mov 8(%rax),%rdi 3883 mov 16(%rax),%rsi 3884 mov %rax,152($context) # restore context->Rsp 3885 mov %rsi,168($context) # restore context->Rsi 3886 mov %rdi,176($context) # restore context->Rdi 3887 3888 mov 40($disp),%rdi # disp->ContextRecord 3889 mov $context,%rsi # context 3890 mov \$154,%ecx # sizeof(CONTEXT) 3891 .long 0xa548f3fc # cld; rep movsq 3892 3893 mov $disp,%rsi 3894 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3895 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3896 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3897 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3898 mov 40(%rsi),%r10 # disp->ContextRecord 3899 lea 56(%rsi),%r11 # &disp->HandlerData 3900 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3901 mov %r10,32(%rsp) # arg5 3902 mov %r11,40(%rsp) # arg6 3903 mov %r12,48(%rsp) # arg7 3904 mov %rcx,56(%rsp) # arg8, (NULL) 3905 call *__imp_RtlVirtualUnwind(%rip) 3906 3907 mov \$1,%eax # ExceptionContinueSearch 3908 add \$64,%rsp 3909 popfq 3910 pop %r15 3911 pop %r14 3912 pop %r13 3913 pop %r12 3914 pop %rbp 3915 pop %rbx 3916 pop %rdi 3917 pop %rsi 3918 ret 3919 .size cbc_se_handler,.-cbc_se_handler 3920 3921 .section .pdata 3922 .align 4 3923 ___ 3924 $code.=<<___ if ($PREFIX eq "aesni"); 3925 .rva .LSEH_begin_aesni_ecb_encrypt 3926 .rva .LSEH_end_aesni_ecb_encrypt 3927 .rva .LSEH_info_ecb 3928 3929 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 3930 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 3931 .rva .LSEH_info_ccm64_enc 3932 3933 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks 3934 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 3935 .rva .LSEH_info_ccm64_dec 3936 3937 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 3938 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 3939 .rva .LSEH_info_ctr32 3940 3941 .rva .LSEH_begin_aesni_xts_encrypt 3942 .rva .LSEH_end_aesni_xts_encrypt 3943 .rva .LSEH_info_xts_enc 3944 3945 .rva .LSEH_begin_aesni_xts_decrypt 3946 .rva .LSEH_end_aesni_xts_decrypt 3947 .rva .LSEH_info_xts_dec 3948 ___ 3949 $code.=<<___; 3950 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 3951 .rva .LSEH_end_${PREFIX}_cbc_encrypt 3952 .rva .LSEH_info_cbc 3953 3954 .rva ${PREFIX}_set_decrypt_key 3955 .rva .LSEH_end_set_decrypt_key 3956 .rva .LSEH_info_key 3957 3958 .rva ${PREFIX}_set_encrypt_key 3959 .rva .LSEH_end_set_encrypt_key 3960 .rva .LSEH_info_key 3961 .section .xdata 3962 .align 8 3963 ___ 3964 $code.=<<___ if ($PREFIX eq "aesni"); 3965 .LSEH_info_ecb: 3966 .byte 9,0,0,0 3967 .rva ecb_ccm64_se_handler 3968 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 3969 .LSEH_info_ccm64_enc: 3970 .byte 9,0,0,0 3971 .rva ecb_ccm64_se_handler 3972 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 3973 .LSEH_info_ccm64_dec: 3974 .byte 9,0,0,0 3975 .rva ecb_ccm64_se_handler 3976 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 3977 .LSEH_info_ctr32: 3978 .byte 9,0,0,0 3979 .rva ctr_xts_se_handler 3980 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 3981 .LSEH_info_xts_enc: 3982 .byte 9,0,0,0 3983 .rva ctr_xts_se_handler 3984 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3985 .LSEH_info_xts_dec: 3986 .byte 9,0,0,0 3987 .rva ctr_xts_se_handler 3988 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3989 ___ 3990 $code.=<<___; 3991 .LSEH_info_cbc: 3992 .byte 9,0,0,0 3993 .rva cbc_se_handler 3994 .LSEH_info_key: 3995 .byte 0x01,0x04,0x01,0x00 3996 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 3997 ___ 3998 } 3999 4000 sub rex { 4001 local *opcode=shift; 4002 my ($dst,$src)=@_; 4003 my $rex=0; 4004 4005 $rex|=0x04 if($dst>=8); 4006 $rex|=0x01 if($src>=8); 4007 push @opcode,$rex|0x40 if($rex); 4008 } 4009 4010 sub aesni { 4011 my $line=shift; 4012 my @opcode=(0x66); 4013 4014 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 4015 rex(\@opcode,$4,$3); 4016 push @opcode,0x0f,0x3a,0xdf; 4017 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 4018 my $c=$2; 4019 push @opcode,$c=~/^0/?oct($c):$c; 4020 return ".byte\t".join(',',@opcode); 4021 } 4022 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 4023 my %opcodelet = ( 4024 "aesimc" => 0xdb, 4025 "aesenc" => 0xdc, "aesenclast" => 0xdd, 4026 "aesdec" => 0xde, "aesdeclast" => 0xdf 4027 ); 4028 return undef if (!defined($opcodelet{$1})); 4029 rex(\@opcode,$3,$2); 4030 push @opcode,0x0f,0x38,$opcodelet{$1}; 4031 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 4032 return ".byte\t".join(',',@opcode); 4033 } 4034 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 4035 my %opcodelet = ( 4036 "aesenc" => 0xdc, "aesenclast" => 0xdd, 4037 "aesdec" => 0xde, "aesdeclast" => 0xdf 4038 ); 4039 return undef if (!defined($opcodelet{$1})); 4040 my $off = $2; 4041 push @opcode,0x44 if ($3>=8); 4042 push @opcode,0x0f,0x38,$opcodelet{$1}; 4043 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 4044 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 4045 return ".byte\t".join(',',@opcode); 4046 } 4047 return $line; 4048 } 4049 4050 sub movbe { 4051 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 4052 } 4053 4054 $code =~ s/\`([^\`]*)\`/eval($1)/gem; 4055 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 4056 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 4057 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 4058 4059 print $code; 4060 4061 close STDOUT; 4062