#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge.
# Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance?
# Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt.
# AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line: assembler flavour and/or output file name; a lone
# argument containing '.' is taken to be the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script (or in the shared
# perlasm directory) and pipe all generated code through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# NOTE(review): both arms of the ternary are "movups"; the conditional
# form is retained for symmetry with the $PREFIX="AES" build.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Aliases overlapping $inout4..7 above, used by modes that don't need
# the full eight-register bank at once.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# Emits code for one AES pass over a single block held in $inout
# (defaults to $inout0). When $ivec is supplied it is XOR-ed into the
# block together with the 0-round key (CBC-style whitening). $sn makes
# each emitted loop label unique.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}

# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block entry points. Registers used for round keys and
# the data block are zeroed on the way out ("clear register bank").
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds		# rounds -> key-schedule byte offset
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key	# $key points past end of schedule
	neg	%rax			# $rounds
	add	\$16,%rax		# %rax counts up toward zero

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds		# rounds -> key-schedule byte offset
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key	# $key points past end of schedule
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds		# rounds -> key-schedule byte offset
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key	# $key points past end of schedule
	neg	%rax			# $rounds
	.byte	0x0f,0x1f,0x00		# 3-byte NOP, code-alignment padding
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
# Note: first-round aes ops are interleaved with the 0-round xors, and
# the loop is entered mid-body at .L${dir}_loop6_enter to account for
# the three aes ops already issued above.
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds		# rounds -> key-schedule byte offset
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key	# $key points past end of schedule
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
# Same mid-loop entry technique as 6x: two first-round aes ops are
# issued during whitening, so the loop is entered at _loop8_inner.
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds		# rounds -> key-schedule byte offset
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key	# $key points past end of schedule
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout7
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the private N-way subroutines; "enc" flavours are only needed
# for the full aesni build.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if
($PREFIX eq "aesni"); 585 &aesni_generate3("dec"); 586 &aesni_generate4("enc") if ($PREFIX eq "aesni"); 587 &aesni_generate4("dec"); 588 &aesni_generate6("enc") if ($PREFIX eq "aesni"); 589 &aesni_generate6("dec"); 590 &aesni_generate8("enc") if ($PREFIX eq "aesni"); 591 &aesni_generate8("dec"); 592 594 if ($PREFIX eq "aesni") { 595 ######################################################################## 596 # void aesni_ecb_encrypt (const void *in, void *out, 597 # size_t length, const AES_KEY *key, 598 # int enc); 599 $code.=<<___; 600 .globl aesni_ecb_encrypt 601 .type aesni_ecb_encrypt,\@function,5 602 .align 16 603 aesni_ecb_encrypt: 604 ___ 605 $code.=<<___ if ($win64); 606 lea -0x58(%rsp),%rsp 607 movaps %xmm6,(%rsp) # offload $inout4..7 608 movaps %xmm7,0x10(%rsp) 609 movaps %xmm8,0x20(%rsp) 610 movaps %xmm9,0x30(%rsp) 611 .Lecb_enc_body: 612 ___ 613 $code.=<<___; 614 and \$-16,$len # if ($len<16) 615 jz .Lecb_ret # return 616 617 mov 240($key),$rounds # key->rounds 618 $movkey ($key),$rndkey0 619 mov $key,$key_ # backup $key 620 mov $rounds,$rnds_ # backup $rounds 621 test %r8d,%r8d # 5th argument 622 jz .Lecb_decrypt 623 #--------------------------- ECB ENCRYPT ------------------------------# 624 cmp \$0x80,$len # if ($len<8*16) 625 jb .Lecb_enc_tail # short input 626 627 movdqu ($inp),$inout0 # load 8 input blocks 628 movdqu 0x10($inp),$inout1 629 movdqu 0x20($inp),$inout2 630 movdqu 0x30($inp),$inout3 631 movdqu 0x40($inp),$inout4 632 movdqu 0x50($inp),$inout5 633 movdqu 0x60($inp),$inout6 634 movdqu 0x70($inp),$inout7 635 lea 0x80($inp),$inp # $inp+=8*16 636 sub \$0x80,$len # $len-=8*16 (can be zero) 637 jmp .Lecb_enc_loop8_enter 638 .align 16 639 .Lecb_enc_loop8: 640 movups $inout0,($out) # store 8 output blocks 641 mov $key_,$key # restore $key 642 movdqu ($inp),$inout0 # load 8 input blocks 643 mov $rnds_,$rounds # restore $rounds 644 movups $inout1,0x10($out) 645 movdqu 0x10($inp),$inout1 646 movups $inout2,0x20($out) 647 movdqu 0x20($inp),$inout2 
648 movups $inout3,0x30($out) 649 movdqu 0x30($inp),$inout3 650 movups $inout4,0x40($out) 651 movdqu 0x40($inp),$inout4 652 movups $inout5,0x50($out) 653 movdqu 0x50($inp),$inout5 654 movups $inout6,0x60($out) 655 movdqu 0x60($inp),$inout6 656 movups $inout7,0x70($out) 657 lea 0x80($out),$out # $out+=8*16 658 movdqu 0x70($inp),$inout7 659 lea 0x80($inp),$inp # $inp+=8*16 660 .Lecb_enc_loop8_enter: 661 662 call _aesni_encrypt8 663 664 sub \$0x80,$len 665 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 666 667 movups $inout0,($out) # store 8 output blocks 668 mov $key_,$key # restore $key 669 movups $inout1,0x10($out) 670 mov $rnds_,$rounds # restore $rounds 671 movups $inout2,0x20($out) 672 movups $inout3,0x30($out) 673 movups $inout4,0x40($out) 674 movups $inout5,0x50($out) 675 movups $inout6,0x60($out) 676 movups $inout7,0x70($out) 677 lea 0x80($out),$out # $out+=8*16 678 add \$0x80,$len # restore real remaining $len 679 jz .Lecb_ret # done if ($len==0) 680 681 .Lecb_enc_tail: # $len is less than 8*16 682 movups ($inp),$inout0 683 cmp \$0x20,$len 684 jb .Lecb_enc_one 685 movups 0x10($inp),$inout1 686 je .Lecb_enc_two 687 movups 0x20($inp),$inout2 688 cmp \$0x40,$len 689 jb .Lecb_enc_three 690 movups 0x30($inp),$inout3 691 je .Lecb_enc_four 692 movups 0x40($inp),$inout4 693 cmp \$0x60,$len 694 jb .Lecb_enc_five 695 movups 0x50($inp),$inout5 696 je .Lecb_enc_six 697 movdqu 0x60($inp),$inout6 698 xorps $inout7,$inout7 699 call _aesni_encrypt8 700 movups $inout0,($out) # store 7 output blocks 701 movups $inout1,0x10($out) 702 movups $inout2,0x20($out) 703 movups $inout3,0x30($out) 704 movups $inout4,0x40($out) 705 movups $inout5,0x50($out) 706 movups $inout6,0x60($out) 707 jmp .Lecb_ret 708 .align 16 709 .Lecb_enc_one: 710 ___ 711 &aesni_generate1("enc",$key,$rounds); 712 $code.=<<___; 713 movups $inout0,($out) # store one output block 714 jmp .Lecb_ret 715 .align 16 716 .Lecb_enc_two: 717 call _aesni_encrypt2 718 movups $inout0,($out) # store 2 output blocks 
719 movups $inout1,0x10($out) 720 jmp .Lecb_ret 721 .align 16 722 .Lecb_enc_three: 723 call _aesni_encrypt3 724 movups $inout0,($out) # store 3 output blocks 725 movups $inout1,0x10($out) 726 movups $inout2,0x20($out) 727 jmp .Lecb_ret 728 .align 16 729 .Lecb_enc_four: 730 call _aesni_encrypt4 731 movups $inout0,($out) # store 4 output blocks 732 movups $inout1,0x10($out) 733 movups $inout2,0x20($out) 734 movups $inout3,0x30($out) 735 jmp .Lecb_ret 736 .align 16 737 .Lecb_enc_five: 738 xorps $inout5,$inout5 739 call _aesni_encrypt6 740 movups $inout0,($out) # store 5 output blocks 741 movups $inout1,0x10($out) 742 movups $inout2,0x20($out) 743 movups $inout3,0x30($out) 744 movups $inout4,0x40($out) 745 jmp .Lecb_ret 746 .align 16 747 .Lecb_enc_six: 748 call _aesni_encrypt6 749 movups $inout0,($out) # store 6 output blocks 750 movups $inout1,0x10($out) 751 movups $inout2,0x20($out) 752 movups $inout3,0x30($out) 753 movups $inout4,0x40($out) 754 movups $inout5,0x50($out) 755 jmp .Lecb_ret 756 #--------------------------- ECB DECRYPT ------------------------------# 758 .align 16 759 .Lecb_decrypt: 760 cmp \$0x80,$len # if ($len<8*16) 761 jb .Lecb_dec_tail # short input 762 763 movdqu ($inp),$inout0 # load 8 input blocks 764 movdqu 0x10($inp),$inout1 765 movdqu 0x20($inp),$inout2 766 movdqu 0x30($inp),$inout3 767 movdqu 0x40($inp),$inout4 768 movdqu 0x50($inp),$inout5 769 movdqu 0x60($inp),$inout6 770 movdqu 0x70($inp),$inout7 771 lea 0x80($inp),$inp # $inp+=8*16 772 sub \$0x80,$len # $len-=8*16 (can be zero) 773 jmp .Lecb_dec_loop8_enter 774 .align 16 775 .Lecb_dec_loop8: 776 movups $inout0,($out) # store 8 output blocks 777 mov $key_,$key # restore $key 778 movdqu ($inp),$inout0 # load 8 input blocks 779 mov $rnds_,$rounds # restore $rounds 780 movups $inout1,0x10($out) 781 movdqu 0x10($inp),$inout1 782 movups $inout2,0x20($out) 783 movdqu 0x20($inp),$inout2 784 movups $inout3,0x30($out) 785 movdqu 0x30($inp),$inout3 786 movups $inout4,0x40($out) 787 movdqu 
0x40($inp),$inout4 788 movups $inout5,0x50($out) 789 movdqu 0x50($inp),$inout5 790 movups $inout6,0x60($out) 791 movdqu 0x60($inp),$inout6 792 movups $inout7,0x70($out) 793 lea 0x80($out),$out # $out+=8*16 794 movdqu 0x70($inp),$inout7 795 lea 0x80($inp),$inp # $inp+=8*16 796 .Lecb_dec_loop8_enter: 797 798 call _aesni_decrypt8 799 800 $movkey ($key_),$rndkey0 801 sub \$0x80,$len 802 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 803 804 movups $inout0,($out) # store 8 output blocks 805 pxor $inout0,$inout0 # clear register bank 806 mov $key_,$key # restore $key 807 movups $inout1,0x10($out) 808 pxor $inout1,$inout1 809 mov $rnds_,$rounds # restore $rounds 810 movups $inout2,0x20($out) 811 pxor $inout2,$inout2 812 movups $inout3,0x30($out) 813 pxor $inout3,$inout3 814 movups $inout4,0x40($out) 815 pxor $inout4,$inout4 816 movups $inout5,0x50($out) 817 pxor $inout5,$inout5 818 movups $inout6,0x60($out) 819 pxor $inout6,$inout6 820 movups $inout7,0x70($out) 821 pxor $inout7,$inout7 822 lea 0x80($out),$out # $out+=8*16 823 add \$0x80,$len # restore real remaining $len 824 jz .Lecb_ret # done if ($len==0) 825 826 .Lecb_dec_tail: 827 movups ($inp),$inout0 828 cmp \$0x20,$len 829 jb .Lecb_dec_one 830 movups 0x10($inp),$inout1 831 je .Lecb_dec_two 832 movups 0x20($inp),$inout2 833 cmp \$0x40,$len 834 jb .Lecb_dec_three 835 movups 0x30($inp),$inout3 836 je .Lecb_dec_four 837 movups 0x40($inp),$inout4 838 cmp \$0x60,$len 839 jb .Lecb_dec_five 840 movups 0x50($inp),$inout5 841 je .Lecb_dec_six 842 movups 0x60($inp),$inout6 843 $movkey ($key),$rndkey0 844 xorps $inout7,$inout7 845 call _aesni_decrypt8 846 movups $inout0,($out) # store 7 output blocks 847 pxor $inout0,$inout0 # clear register bank 848 movups $inout1,0x10($out) 849 pxor $inout1,$inout1 850 movups $inout2,0x20($out) 851 pxor $inout2,$inout2 852 movups $inout3,0x30($out) 853 pxor $inout3,$inout3 854 movups $inout4,0x40($out) 855 pxor $inout4,$inout4 856 movups $inout5,0x50($out) 857 pxor $inout5,$inout5 
858 movups $inout6,0x60($out) 859 pxor $inout6,$inout6 860 pxor $inout7,$inout7 861 jmp .Lecb_ret 862 .align 16 863 .Lecb_dec_one: 864 ___ 865 &aesni_generate1("dec",$key,$rounds); 866 $code.=<<___; 867 movups $inout0,($out) # store one output block 868 pxor $inout0,$inout0 # clear register bank 869 jmp .Lecb_ret 870 .align 16 871 .Lecb_dec_two: 872 call _aesni_decrypt2 873 movups $inout0,($out) # store 2 output blocks 874 pxor $inout0,$inout0 # clear register bank 875 movups $inout1,0x10($out) 876 pxor $inout1,$inout1 877 jmp .Lecb_ret 878 .align 16 879 .Lecb_dec_three: 880 call _aesni_decrypt3 881 movups $inout0,($out) # store 3 output blocks 882 pxor $inout0,$inout0 # clear register bank 883 movups $inout1,0x10($out) 884 pxor $inout1,$inout1 885 movups $inout2,0x20($out) 886 pxor $inout2,$inout2 887 jmp .Lecb_ret 888 .align 16 889 .Lecb_dec_four: 890 call _aesni_decrypt4 891 movups $inout0,($out) # store 4 output blocks 892 pxor $inout0,$inout0 # clear register bank 893 movups $inout1,0x10($out) 894 pxor $inout1,$inout1 895 movups $inout2,0x20($out) 896 pxor $inout2,$inout2 897 movups $inout3,0x30($out) 898 pxor $inout3,$inout3 899 jmp .Lecb_ret 900 .align 16 901 .Lecb_dec_five: 902 xorps $inout5,$inout5 903 call _aesni_decrypt6 904 movups $inout0,($out) # store 5 output blocks 905 pxor $inout0,$inout0 # clear register bank 906 movups $inout1,0x10($out) 907 pxor $inout1,$inout1 908 movups $inout2,0x20($out) 909 pxor $inout2,$inout2 910 movups $inout3,0x30($out) 911 pxor $inout3,$inout3 912 movups $inout4,0x40($out) 913 pxor $inout4,$inout4 914 pxor $inout5,$inout5 915 jmp .Lecb_ret 916 .align 16 917 .Lecb_dec_six: 918 call _aesni_decrypt6 919 movups $inout0,($out) # store 6 output blocks 920 pxor $inout0,$inout0 # clear register bank 921 movups $inout1,0x10($out) 922 pxor $inout1,$inout1 923 movups $inout2,0x20($out) 924 pxor $inout2,$inout2 925 movups $inout3,0x30($out) 926 pxor $inout3,$inout3 927 movups $inout4,0x40($out) 928 pxor $inout4,$inout4 929 movups 
$inout5,0x50($out) 930 pxor $inout5,$inout5 931 932 .Lecb_ret: 933 xorps $rndkey0,$rndkey0 # %xmm0 934 pxor $rndkey1,$rndkey1 935 ___ 936 $code.=<<___ if ($win64); 937 movaps (%rsp),%xmm6 938 movaps %xmm0,(%rsp) # clear stack 939 movaps 0x10(%rsp),%xmm7 940 movaps %xmm0,0x10(%rsp) 941 movaps 0x20(%rsp),%xmm8 942 movaps %xmm0,0x20(%rsp) 943 movaps 0x30(%rsp),%xmm9 944 movaps %xmm0,0x30(%rsp) 945 lea 0x58(%rsp),%rsp 946 .Lecb_enc_ret: 947 ___ 948 $code.=<<___; 949 ret 950 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt 951 ___ 952 954 { 955 ###################################################################### 956 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 957 # size_t blocks, const AES_KEY *key, 958 # const char *ivec,char *cmac); 959 # 960 # Handles only complete blocks, operates on 64-bit counter and 961 # does not update *ivec! Nor does it finalize CMAC value 962 # (see engine/eng_aesni.c for details) 963 # 964 { 965 my $cmac="%r9"; # 6th argument 966 967 my $increment="%xmm9"; 968 my $iv="%xmm6"; 969 my $bswap_mask="%xmm7"; 970 971 $code.=<<___; 972 .globl aesni_ccm64_encrypt_blocks 973 .type aesni_ccm64_encrypt_blocks,\@function,6 974 .align 16 975 aesni_ccm64_encrypt_blocks: 976 ___ 977 $code.=<<___ if ($win64); 978 lea -0x58(%rsp),%rsp 979 movaps %xmm6,(%rsp) # $iv 980 movaps %xmm7,0x10(%rsp) # $bswap_mask 981 movaps %xmm8,0x20(%rsp) # $in0 982 movaps %xmm9,0x30(%rsp) # $increment 983 .Lccm64_enc_body: 984 ___ 985 $code.=<<___; 986 mov 240($key),$rounds # key->rounds 987 movdqu ($ivp),$iv 988 movdqa .Lincrement64(%rip),$increment 989 movdqa .Lbswap_mask(%rip),$bswap_mask 990 991 shl \$4,$rounds 992 mov \$16,$rnds_ 993 lea 0($key),$key_ 994 movdqu ($cmac),$inout1 995 movdqa $iv,$inout0 996 lea 32($key,$rounds),$key # end of key schedule 997 pshufb $bswap_mask,$iv 998 sub %rax,%r10 # twisted $rounds 999 jmp .Lccm64_enc_outer 1000 .align 16 1001 .Lccm64_enc_outer: 1002 $movkey ($key_),$rndkey0 1003 mov %r10,%rax 1004 movups ($inp),$in0 # load 
inp 1005 1006 xorps $rndkey0,$inout0 # counter 1007 $movkey 16($key_),$rndkey1 1008 xorps $in0,$rndkey0 1009 xorps $rndkey0,$inout1 # cmac^=inp 1010 $movkey 32($key_),$rndkey0 1011 1012 .Lccm64_enc2_loop: 1013 aesenc $rndkey1,$inout0 1014 aesenc $rndkey1,$inout1 1015 $movkey ($key,%rax),$rndkey1 1016 add \$32,%rax 1017 aesenc $rndkey0,$inout0 1018 aesenc $rndkey0,$inout1 1019 $movkey -16($key,%rax),$rndkey0 1020 jnz .Lccm64_enc2_loop 1021 aesenc $rndkey1,$inout0 1022 aesenc $rndkey1,$inout1 1023 paddq $increment,$iv 1024 dec $len # $len-- ($len is in blocks) 1025 aesenclast $rndkey0,$inout0 1026 aesenclast $rndkey0,$inout1 1027 1028 lea 16($inp),$inp 1029 xorps $inout0,$in0 # inp ^= E(iv) 1030 movdqa $iv,$inout0 1031 movups $in0,($out) # save output 1032 pshufb $bswap_mask,$inout0 1033 lea 16($out),$out # $out+=16 1034 jnz .Lccm64_enc_outer # loop if ($len!=0) 1035 1036 pxor $rndkey0,$rndkey0 # clear register bank 1037 pxor $rndkey1,$rndkey1 1038 pxor $inout0,$inout0 1039 movups $inout1,($cmac) # store resulting mac 1040 pxor $inout1,$inout1 1041 pxor $in0,$in0 1042 pxor $iv,$iv 1043 ___ 1044 $code.=<<___ if ($win64); 1045 movaps (%rsp),%xmm6 1046 movaps %xmm0,(%rsp) # clear stack 1047 movaps 0x10(%rsp),%xmm7 1048 movaps %xmm0,0x10(%rsp) 1049 movaps 0x20(%rsp),%xmm8 1050 movaps %xmm0,0x20(%rsp) 1051 movaps 0x30(%rsp),%xmm9 1052 movaps %xmm0,0x30(%rsp) 1053 lea 0x58(%rsp),%rsp 1054 .Lccm64_enc_ret: 1055 ___ 1056 $code.=<<___; 1057 ret 1058 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks 1059 ___ 1060 ###################################################################### 1061 $code.=<<___; 1062 .globl aesni_ccm64_decrypt_blocks 1063 .type aesni_ccm64_decrypt_blocks,\@function,6 1064 .align 16 1065 aesni_ccm64_decrypt_blocks: 1066 ___ 1067 $code.=<<___ if ($win64); 1068 lea -0x58(%rsp),%rsp 1069 movaps %xmm6,(%rsp) # $iv 1070 movaps %xmm7,0x10(%rsp) # $bswap_mask 1071 movaps %xmm8,0x20(%rsp) # $in8 1072 movaps %xmm9,0x30(%rsp) # $increment 1073 
.Lccm64_dec_body: 1074 ___ 1075 $code.=<<___; 1076 mov 240($key),$rounds # key->rounds 1077 movups ($ivp),$iv 1078 movdqu ($cmac),$inout1 1079 movdqa .Lincrement64(%rip),$increment 1080 movdqa .Lbswap_mask(%rip),$bswap_mask 1081 1082 movaps $iv,$inout0 1083 mov $rounds,$rnds_ 1084 mov $key,$key_ 1085 pshufb $bswap_mask,$iv 1086 ___ 1087 &aesni_generate1("enc",$key,$rounds); 1088 $code.=<<___; 1089 shl \$4,$rnds_ 1090 mov \$16,$rounds 1091 movups ($inp),$in0 # load inp 1092 paddq $increment,$iv 1093 lea 16($inp),$inp # $inp+=16 1094 sub %r10,%rax # twisted $rounds 1095 lea 32($key_,$rnds_),$key # end of key schedule 1096 mov %rax,%r10 1097 jmp .Lccm64_dec_outer 1098 .align 16 1099 .Lccm64_dec_outer: 1100 xorps $inout0,$in0 # inp ^= E(iv) 1101 movdqa $iv,$inout0 1102 movups $in0,($out) # save output 1103 lea 16($out),$out # $out+=16 1104 pshufb $bswap_mask,$inout0 1105 1106 sub \$1,$len # $len-- ($len is in blocks) 1107 jz .Lccm64_dec_break # if ($len==0) break 1108 1109 $movkey ($key_),$rndkey0 1110 mov %r10,%rax 1111 $movkey 16($key_),$rndkey1 1112 xorps $rndkey0,$in0 1113 xorps $rndkey0,$inout0 1114 xorps $in0,$inout1 # cmac^=out 1115 $movkey 32($key_),$rndkey0 1116 jmp .Lccm64_dec2_loop 1117 .align 16 1118 .Lccm64_dec2_loop: 1119 aesenc $rndkey1,$inout0 1120 aesenc $rndkey1,$inout1 1121 $movkey ($key,%rax),$rndkey1 1122 add \$32,%rax 1123 aesenc $rndkey0,$inout0 1124 aesenc $rndkey0,$inout1 1125 $movkey -16($key,%rax),$rndkey0 1126 jnz .Lccm64_dec2_loop 1127 movups ($inp),$in0 # load input 1128 paddq $increment,$iv 1129 aesenc $rndkey1,$inout0 1130 aesenc $rndkey1,$inout1 1131 aesenclast $rndkey0,$inout0 1132 aesenclast $rndkey0,$inout1 1133 lea 16($inp),$inp # $inp+=16 1134 jmp .Lccm64_dec_outer 1135 1136 .align 16 1137 .Lccm64_dec_break: 1138 #xorps $in0,$inout1 # cmac^=out 1139 mov 240($key_),$rounds 1140 ___ 1141 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1142 $code.=<<___; 1143 pxor $rndkey0,$rndkey0 # clear register bank 1144 pxor $rndkey1,$rndkey1 
1145 pxor $inout0,$inout0 1146 movups $inout1,($cmac) # store resulting mac 1147 pxor $inout1,$inout1 1148 pxor $in0,$in0 1149 pxor $iv,$iv 1150 ___ 1151 $code.=<<___ if ($win64); 1152 movaps (%rsp),%xmm6 1153 movaps %xmm0,(%rsp) # clear stack 1154 movaps 0x10(%rsp),%xmm7 1155 movaps %xmm0,0x10(%rsp) 1156 movaps 0x20(%rsp),%xmm8 1157 movaps %xmm0,0x20(%rsp) 1158 movaps 0x30(%rsp),%xmm9 1159 movaps %xmm0,0x30(%rsp) 1160 lea 0x58(%rsp),%rsp 1161 .Lccm64_dec_ret: 1162 ___ 1163 $code.=<<___; 1164 ret 1165 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks 1166 ___ 1167 } 1169 ###################################################################### 1170 # void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1171 # size_t blocks, const AES_KEY *key, 1172 # const char *ivec); 1173 # 1174 # Handles only complete blocks, operates on 32-bit counter and 1175 # does not update *ivec! (see crypto/modes/ctr128.c for details) 1176 # 1177 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1178 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1179 # Keywords are full unroll and modulo-schedule counter calculations 1180 # with zero-round key xor. 
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx			# key->rounds
___
&aesni_generate1("enc",$key,"%edx");
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_			# use $key_ as frame pointer
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)		# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr			# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0			# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)		# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10			# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx			# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds		# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	4(%r10),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len			# $len is in blocks
	jb	.Lctr32_tail			# short input if ($len<8)

	sub	\$6,$len			# $len is biased by -6
	cmp	\$`1<<22`,%r10d			# check for MOVBE without XSAVE
	je	.Lctr32_6x			# [which denotes Atom Silvermont]

	lea	0x80($key),$key			# size optimization
	sub	\$2,$len			# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key		# end of key schedule
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	 add	\$6,$ctr			# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	 mov	$ctr,%eax
	 xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	 movbe	%eax,`0x00+12`(%rsp)		# store next counter value
	 lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	 xor	$key0,%eax
	 movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	 lea	2($ctr),%eax
	 xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	 movbe	%eax,`0x20+12`(%rsp)
	 lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	 xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	 movbe	%eax,`0x30+12`(%rsp)
	 lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	 xor	$key0,%eax
	 movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	 lea	5($ctr),%eax
	 xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	 movbe	%eax,`0x50+12`(%rsp)
	 mov	%r10,%rax			# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6			# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp			# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6			# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0		# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)			# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out			# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6			# loop if $len-=6 didn't borrow

	add	\$6,$len			# restore real remaining $len
	jz	.Lctr32_done			# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key		# restore $key
	neg	$rounds
	shr	\$4,$rounds			# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	 add	\$8,$ctr			# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	 mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	 bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	 xor	$key0,%r9d
	 nop
	aesenc	$rndkey1,$inout3
	 mov	%r9d,0x00+12(%rsp)		# store next counter value
	 lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	 bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	 xor	$key0,%r9d
	 .byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	 mov	%r9d,`0x10*($i-1)`+12(%rsp)
	 lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
$code.=<<___;
	 bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	 xor	$key0,%r9d
	 movdqu	0x00($inp),$in0			# start loading input
	aesenc	$rndkey0,$inout3
	 mov	%r9d,0x70+12(%rsp)
	 cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0			# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1		# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp			# $inp+=8*16

	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1		# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1			# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1	# real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)			# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out			# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow

	add	\$8,$len			# restore real remaining $len
	jz	.Lctr32_done			# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax			# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1			# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)			# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done			# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done			# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done			# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0			# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)			# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done			# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0			# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)			# store output
	cmp	\$2,$len
	jb	.Lctr32_done			# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done			# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)		# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0			# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)		# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
$code.=<<___;
	mov	-8($key_),%rbp
	lea	($key_),%rsp
.Lctr32_epilogue:
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";			# override so that we can use %r11 as FP

$code.=<<___;
1755 .globl aesni_xts_encrypt 1756 .type aesni_xts_encrypt,\@function,6 1757 .align 16 1758 aesni_xts_encrypt: 1759 lea (%rsp),%r11 # frame pointer 1760 push %rbp 1761 sub \$$frame_size,%rsp 1762 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1763 ___ 1764 $code.=<<___ if ($win64); 1765 movaps %xmm6,-0xa8(%r11) # offload everything 1766 movaps %xmm7,-0x98(%r11) 1767 movaps %xmm8,-0x88(%r11) 1768 movaps %xmm9,-0x78(%r11) 1769 movaps %xmm10,-0x68(%r11) 1770 movaps %xmm11,-0x58(%r11) 1771 movaps %xmm12,-0x48(%r11) 1772 movaps %xmm13,-0x38(%r11) 1773 movaps %xmm14,-0x28(%r11) 1774 movaps %xmm15,-0x18(%r11) 1775 .Lxts_enc_body: 1776 ___ 1777 $code.=<<___; 1778 movups ($ivp),$inout0 # load clear-text tweak 1779 mov 240(%r8),$rounds # key2->rounds 1780 mov 240($key),$rnds_ # key1->rounds 1781 ___ 1782 # generate the tweak 1783 &aesni_generate1("enc",$key2,$rounds,$inout0); 1784 $code.=<<___; 1785 $movkey ($key),$rndkey0 # zero round key 1786 mov $key,$key_ # backup $key 1787 mov $rnds_,$rounds # backup $rounds 1788 shl \$4,$rnds_ 1789 mov $len,$len_ # backup $len 1790 and \$-16,$len 1791 1792 $movkey 16($key,$rnds_),$rndkey1 # last round key 1793 1794 movdqa .Lxts_magic(%rip),$twmask 1795 movdqa $inout0,@tweak[5] 1796 pshufd \$0x5f,$inout0,$twres 1797 pxor $rndkey0,$rndkey1 1798 ___ 1799 # alternative tweak calculation algorithm is based on suggestions 1800 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1801 # and should help in the future... 
1802 for ($i=0;$i<4;$i++) { 1803 $code.=<<___; 1804 movdqa $twres,$twtmp 1805 paddd $twres,$twres 1806 movdqa @tweak[5],@tweak[$i] 1807 psrad \$31,$twtmp # broadcast upper bits 1808 paddq @tweak[5],@tweak[5] 1809 pand $twmask,$twtmp 1810 pxor $rndkey0,@tweak[$i] 1811 pxor $twtmp,@tweak[5] 1812 ___ 1813 } 1814 $code.=<<___; 1815 movdqa @tweak[5],@tweak[4] 1816 psrad \$31,$twres 1817 paddq @tweak[5],@tweak[5] 1818 pand $twmask,$twres 1819 pxor $rndkey0,@tweak[4] 1820 pxor $twres,@tweak[5] 1821 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1822 1823 sub \$16*6,$len 1824 jc .Lxts_enc_short # if $len-=6*16 borrowed 1825 1826 mov \$16+96,$rounds 1827 lea 32($key_,$rnds_),$key # end of key schedule 1828 sub %r10,%rax # twisted $rounds 1829 $movkey 16($key_),$rndkey1 1830 mov %rax,%r10 # backup twisted $rounds 1831 lea .Lxts_magic(%rip),%r8 1832 jmp .Lxts_enc_grandloop 1833 1834 .align 32 1835 .Lxts_enc_grandloop: 1836 movdqu `16*0`($inp),$inout0 # load input 1837 movdqa $rndkey0,$twmask 1838 movdqu `16*1`($inp),$inout1 1839 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1840 movdqu `16*2`($inp),$inout2 1841 pxor @tweak[1],$inout1 1842 aesenc $rndkey1,$inout0 1843 movdqu `16*3`($inp),$inout3 1844 pxor @tweak[2],$inout2 1845 aesenc $rndkey1,$inout1 1846 movdqu `16*4`($inp),$inout4 1847 pxor @tweak[3],$inout3 1848 aesenc $rndkey1,$inout2 1849 movdqu `16*5`($inp),$inout5 1850 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1851 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1852 pxor @tweak[4],$inout4 1853 aesenc $rndkey1,$inout3 1854 $movkey 32($key_),$rndkey0 1855 lea `16*6`($inp),$inp 1856 pxor $twmask,$inout5 1857 1858 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 1859 aesenc $rndkey1,$inout4 1860 pxor $twres,@tweak[1] 1861 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1862 aesenc $rndkey1,$inout5 1863 $movkey 48($key_),$rndkey1 1864 pxor $twres,@tweak[2] 1865 1866 aesenc $rndkey0,$inout0 1867 pxor $twres,@tweak[3] 1868 movdqa 
@tweak[1],`16*1`(%rsp) 1869 aesenc $rndkey0,$inout1 1870 pxor $twres,@tweak[4] 1871 movdqa @tweak[2],`16*2`(%rsp) 1872 aesenc $rndkey0,$inout2 1873 aesenc $rndkey0,$inout3 1874 pxor $twres,$twmask 1875 movdqa @tweak[4],`16*4`(%rsp) 1876 aesenc $rndkey0,$inout4 1877 aesenc $rndkey0,$inout5 1878 $movkey 64($key_),$rndkey0 1879 movdqa $twmask,`16*5`(%rsp) 1880 pshufd \$0x5f,@tweak[5],$twres 1881 jmp .Lxts_enc_loop6 1882 .align 32 1883 .Lxts_enc_loop6: 1884 aesenc $rndkey1,$inout0 1885 aesenc $rndkey1,$inout1 1886 aesenc $rndkey1,$inout2 1887 aesenc $rndkey1,$inout3 1888 aesenc $rndkey1,$inout4 1889 aesenc $rndkey1,$inout5 1890 $movkey -64($key,%rax),$rndkey1 1891 add \$32,%rax 1892 1893 aesenc $rndkey0,$inout0 1894 aesenc $rndkey0,$inout1 1895 aesenc $rndkey0,$inout2 1896 aesenc $rndkey0,$inout3 1897 aesenc $rndkey0,$inout4 1898 aesenc $rndkey0,$inout5 1899 $movkey -80($key,%rax),$rndkey0 1900 jnz .Lxts_enc_loop6 1901 1902 movdqa (%r8),$twmask # start calculating next tweak 1903 movdqa $twres,$twtmp 1904 paddd $twres,$twres 1905 aesenc $rndkey1,$inout0 1906 paddq @tweak[5],@tweak[5] 1907 psrad \$31,$twtmp 1908 aesenc $rndkey1,$inout1 1909 pand $twmask,$twtmp 1910 $movkey ($key_),@tweak[0] # load round[0] 1911 aesenc $rndkey1,$inout2 1912 aesenc $rndkey1,$inout3 1913 aesenc $rndkey1,$inout4 1914 pxor $twtmp,@tweak[5] 1915 movaps @tweak[0],@tweak[1] # copy round[0] 1916 aesenc $rndkey1,$inout5 1917 $movkey -64($key),$rndkey1 1918 1919 movdqa $twres,$twtmp 1920 aesenc $rndkey0,$inout0 1921 paddd $twres,$twres 1922 pxor @tweak[5],@tweak[0] 1923 aesenc $rndkey0,$inout1 1924 psrad \$31,$twtmp 1925 paddq @tweak[5],@tweak[5] 1926 aesenc $rndkey0,$inout2 1927 aesenc $rndkey0,$inout3 1928 pand $twmask,$twtmp 1929 movaps @tweak[1],@tweak[2] 1930 aesenc $rndkey0,$inout4 1931 pxor $twtmp,@tweak[5] 1932 movdqa $twres,$twtmp 1933 aesenc $rndkey0,$inout5 1934 $movkey -48($key),$rndkey0 1935 1936 paddd $twres,$twres 1937 aesenc $rndkey1,$inout0 1938 pxor @tweak[5],@tweak[1] 1939 psrad 
\$31,$twtmp 1940 aesenc $rndkey1,$inout1 1941 paddq @tweak[5],@tweak[5] 1942 pand $twmask,$twtmp 1943 aesenc $rndkey1,$inout2 1944 aesenc $rndkey1,$inout3 1945 movdqa @tweak[3],`16*3`(%rsp) 1946 pxor $twtmp,@tweak[5] 1947 aesenc $rndkey1,$inout4 1948 movaps @tweak[2],@tweak[3] 1949 movdqa $twres,$twtmp 1950 aesenc $rndkey1,$inout5 1951 $movkey -32($key),$rndkey1 1952 1953 paddd $twres,$twres 1954 aesenc $rndkey0,$inout0 1955 pxor @tweak[5],@tweak[2] 1956 psrad \$31,$twtmp 1957 aesenc $rndkey0,$inout1 1958 paddq @tweak[5],@tweak[5] 1959 pand $twmask,$twtmp 1960 aesenc $rndkey0,$inout2 1961 aesenc $rndkey0,$inout3 1962 aesenc $rndkey0,$inout4 1963 pxor $twtmp,@tweak[5] 1964 movaps @tweak[3],@tweak[4] 1965 aesenc $rndkey0,$inout5 1966 1967 movdqa $twres,$rndkey0 1968 paddd $twres,$twres 1969 aesenc $rndkey1,$inout0 1970 pxor @tweak[5],@tweak[3] 1971 psrad \$31,$rndkey0 1972 aesenc $rndkey1,$inout1 1973 paddq @tweak[5],@tweak[5] 1974 pand $twmask,$rndkey0 1975 aesenc $rndkey1,$inout2 1976 aesenc $rndkey1,$inout3 1977 pxor $rndkey0,@tweak[5] 1978 $movkey ($key_),$rndkey0 1979 aesenc $rndkey1,$inout4 1980 aesenc $rndkey1,$inout5 1981 $movkey 16($key_),$rndkey1 1982 1983 pxor @tweak[5],@tweak[4] 1984 aesenclast `16*0`(%rsp),$inout0 1985 psrad \$31,$twres 1986 paddq @tweak[5],@tweak[5] 1987 aesenclast `16*1`(%rsp),$inout1 1988 aesenclast `16*2`(%rsp),$inout2 1989 pand $twmask,$twres 1990 mov %r10,%rax # restore $rounds 1991 aesenclast `16*3`(%rsp),$inout3 1992 aesenclast `16*4`(%rsp),$inout4 1993 aesenclast `16*5`(%rsp),$inout5 1994 pxor $twres,@tweak[5] 1995 1996 lea `16*6`($out),$out # $out+=6*16 1997 movups $inout0,`-16*6`($out) # store 6 output blocks 1998 movups $inout1,`-16*5`($out) 1999 movups $inout2,`-16*4`($out) 2000 movups $inout3,`-16*3`($out) 2001 movups $inout4,`-16*2`($out) 2002 movups $inout5,`-16*1`($out) 2003 sub \$16*6,$len 2004 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2005 2006 mov \$16+96,$rounds 2007 sub $rnds_,$rounds 2008 mov 
$key_,$key # restore $key 2009 shr \$4,$rounds # restore original value 2010 2011 .Lxts_enc_short: 2012 # at the point @tweak[0..5] are populated with tweak values 2013 mov $rounds,$rnds_ # backup $rounds 2014 pxor $rndkey0,@tweak[0] 2015 add \$16*6,$len # restore real remaining $len 2016 jz .Lxts_enc_done # done if ($len==0) 2017 2018 pxor $rndkey0,@tweak[1] 2019 cmp \$0x20,$len 2020 jb .Lxts_enc_one # $len is 1*16 2021 pxor $rndkey0,@tweak[2] 2022 je .Lxts_enc_two # $len is 2*16 2023 2024 pxor $rndkey0,@tweak[3] 2025 cmp \$0x40,$len 2026 jb .Lxts_enc_three # $len is 3*16 2027 pxor $rndkey0,@tweak[4] 2028 je .Lxts_enc_four # $len is 4*16 2029 2030 movdqu ($inp),$inout0 # $len is 5*16 2031 movdqu 16*1($inp),$inout1 2032 movdqu 16*2($inp),$inout2 2033 pxor @tweak[0],$inout0 2034 movdqu 16*3($inp),$inout3 2035 pxor @tweak[1],$inout1 2036 movdqu 16*4($inp),$inout4 2037 lea 16*5($inp),$inp # $inp+=5*16 2038 pxor @tweak[2],$inout2 2039 pxor @tweak[3],$inout3 2040 pxor @tweak[4],$inout4 2041 pxor $inout5,$inout5 2042 2043 call _aesni_encrypt6 2044 2045 xorps @tweak[0],$inout0 2046 movdqa @tweak[5],@tweak[0] 2047 xorps @tweak[1],$inout1 2048 xorps @tweak[2],$inout2 2049 movdqu $inout0,($out) # store 5 output blocks 2050 xorps @tweak[3],$inout3 2051 movdqu $inout1,16*1($out) 2052 xorps @tweak[4],$inout4 2053 movdqu $inout2,16*2($out) 2054 movdqu $inout3,16*3($out) 2055 movdqu $inout4,16*4($out) 2056 lea 16*5($out),$out # $out+=5*16 2057 jmp .Lxts_enc_done 2058 2059 .align 16 2060 .Lxts_enc_one: 2061 movups ($inp),$inout0 2062 lea 16*1($inp),$inp # inp+=1*16 2063 xorps @tweak[0],$inout0 2064 ___ 2065 &aesni_generate1("enc",$key,$rounds); 2066 $code.=<<___; 2067 xorps @tweak[0],$inout0 2068 movdqa @tweak[1],@tweak[0] 2069 movups $inout0,($out) # store one output block 2070 lea 16*1($out),$out # $out+=1*16 2071 jmp .Lxts_enc_done 2072 2073 .align 16 2074 .Lxts_enc_two: 2075 movups ($inp),$inout0 2076 movups 16($inp),$inout1 2077 lea 32($inp),$inp # $inp+=2*16 2078 xorps 
@tweak[0],$inout0 2079 xorps @tweak[1],$inout1 2080 2081 call _aesni_encrypt2 2082 2083 xorps @tweak[0],$inout0 2084 movdqa @tweak[2],@tweak[0] 2085 xorps @tweak[1],$inout1 2086 movups $inout0,($out) # store 2 output blocks 2087 movups $inout1,16*1($out) 2088 lea 16*2($out),$out # $out+=2*16 2089 jmp .Lxts_enc_done 2090 2091 .align 16 2092 .Lxts_enc_three: 2093 movups ($inp),$inout0 2094 movups 16*1($inp),$inout1 2095 movups 16*2($inp),$inout2 2096 lea 16*3($inp),$inp # $inp+=3*16 2097 xorps @tweak[0],$inout0 2098 xorps @tweak[1],$inout1 2099 xorps @tweak[2],$inout2 2100 2101 call _aesni_encrypt3 2102 2103 xorps @tweak[0],$inout0 2104 movdqa @tweak[3],@tweak[0] 2105 xorps @tweak[1],$inout1 2106 xorps @tweak[2],$inout2 2107 movups $inout0,($out) # store 3 output blocks 2108 movups $inout1,16*1($out) 2109 movups $inout2,16*2($out) 2110 lea 16*3($out),$out # $out+=3*16 2111 jmp .Lxts_enc_done 2112 2113 .align 16 2114 .Lxts_enc_four: 2115 movups ($inp),$inout0 2116 movups 16*1($inp),$inout1 2117 movups 16*2($inp),$inout2 2118 xorps @tweak[0],$inout0 2119 movups 16*3($inp),$inout3 2120 lea 16*4($inp),$inp # $inp+=4*16 2121 xorps @tweak[1],$inout1 2122 xorps @tweak[2],$inout2 2123 xorps @tweak[3],$inout3 2124 2125 call _aesni_encrypt4 2126 2127 pxor @tweak[0],$inout0 2128 movdqa @tweak[4],@tweak[0] 2129 pxor @tweak[1],$inout1 2130 pxor @tweak[2],$inout2 2131 movdqu $inout0,($out) # store 4 output blocks 2132 pxor @tweak[3],$inout3 2133 movdqu $inout1,16*1($out) 2134 movdqu $inout2,16*2($out) 2135 movdqu $inout3,16*3($out) 2136 lea 16*4($out),$out # $out+=4*16 2137 jmp .Lxts_enc_done 2138 2139 .align 16 2140 .Lxts_enc_done: 2141 and \$15,$len_ # see if $len%16 is 0 2142 jz .Lxts_enc_ret 2143 mov $len_,$len 2144 2145 .Lxts_enc_steal: 2146 movzb ($inp),%eax # borrow $rounds ... 2147 movzb -16($out),%ecx # ... 
and $key 2148 lea 1($inp),$inp 2149 mov %al,-16($out) 2150 mov %cl,0($out) 2151 lea 1($out),$out 2152 sub \$1,$len 2153 jnz .Lxts_enc_steal 2154 2155 sub $len_,$out # rewind $out 2156 mov $key_,$key # restore $key 2157 mov $rnds_,$rounds # restore $rounds 2158 2159 movups -16($out),$inout0 2160 xorps @tweak[0],$inout0 2161 ___ 2162 &aesni_generate1("enc",$key,$rounds); 2163 $code.=<<___; 2164 xorps @tweak[0],$inout0 2165 movups $inout0,-16($out) 2166 2167 .Lxts_enc_ret: 2168 xorps %xmm0,%xmm0 # clear register bank 2169 pxor %xmm1,%xmm1 2170 pxor %xmm2,%xmm2 2171 pxor %xmm3,%xmm3 2172 pxor %xmm4,%xmm4 2173 pxor %xmm5,%xmm5 2174 ___ 2175 $code.=<<___ if (!$win64); 2176 pxor %xmm6,%xmm6 2177 pxor %xmm7,%xmm7 2178 movaps %xmm0,0x00(%rsp) # clear stack 2179 pxor %xmm8,%xmm8 2180 movaps %xmm0,0x10(%rsp) 2181 pxor %xmm9,%xmm9 2182 movaps %xmm0,0x20(%rsp) 2183 pxor %xmm10,%xmm10 2184 movaps %xmm0,0x30(%rsp) 2185 pxor %xmm11,%xmm11 2186 movaps %xmm0,0x40(%rsp) 2187 pxor %xmm12,%xmm12 2188 movaps %xmm0,0x50(%rsp) 2189 pxor %xmm13,%xmm13 2190 movaps %xmm0,0x60(%rsp) 2191 pxor %xmm14,%xmm14 2192 pxor %xmm15,%xmm15 2193 ___ 2194 $code.=<<___ if ($win64); 2195 movaps -0xa8(%r11),%xmm6 2196 movaps %xmm0,-0xa8(%r11) # clear stack 2197 movaps -0x98(%r11),%xmm7 2198 movaps %xmm0,-0x98(%r11) 2199 movaps -0x88(%r11),%xmm8 2200 movaps %xmm0,-0x88(%r11) 2201 movaps -0x78(%r11),%xmm9 2202 movaps %xmm0,-0x78(%r11) 2203 movaps -0x68(%r11),%xmm10 2204 movaps %xmm0,-0x68(%r11) 2205 movaps -0x58(%r11),%xmm11 2206 movaps %xmm0,-0x58(%r11) 2207 movaps -0x48(%r11),%xmm12 2208 movaps %xmm0,-0x48(%r11) 2209 movaps -0x38(%r11),%xmm13 2210 movaps %xmm0,-0x38(%r11) 2211 movaps -0x28(%r11),%xmm14 2212 movaps %xmm0,-0x28(%r11) 2213 movaps -0x18(%r11),%xmm15 2214 movaps %xmm0,-0x18(%r11) 2215 movaps %xmm0,0x00(%rsp) 2216 movaps %xmm0,0x10(%rsp) 2217 movaps %xmm0,0x20(%rsp) 2218 movaps %xmm0,0x30(%rsp) 2219 movaps %xmm0,0x40(%rsp) 2220 movaps %xmm0,0x50(%rsp) 2221 movaps %xmm0,0x60(%rsp) 2222 ___ 2223 
$code.=<<___;
	mov	-8(%r11),%rbp
	lea	(%r11),%rsp
.Lxts_enc_epilogue:
	ret
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# XTS-AES decryption, the mirror image of aesni_xts_encrypt above: the tweak
# is produced by ENcrypting the IV under $key2 (encryption even on the decrypt
# path, as XTS requires), data blocks are processed with aesdec/aesdeclast
# under $key, six blocks per iteration in the grand loop, with a 1..5-block
# tail and ciphertext stealing for a trailing partial block. Tweak doubling
# in GF(2^128) is interleaved with the AES rounds throughout; the statement
# order below is deliberate scheduling — do not reorder.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	lea	(%rsp),%r11			# frame pointer
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 ABI: xmm6-15 are non-volatile and must be preserved across the call.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
# Unlike the encrypt path, a trailing partial block means the LAST FULL block
# must be withheld from the bulk loop (it donates the stolen ciphertext), so
# round $len down by 16 when $len%16 != 0.
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
# Pre-compute tweaks 0..3: each step doubles @tweak[5] in GF(2^128)
# (paddq shift + conditional xor of the .Lxts_magic polynomial selected by
# the broadcast sign bits) and folds round[0] into the saved copy.
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
# Fifth tweak, then enter the 6-blocks-per-iteration grand loop.
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# intput^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	 aesdec		$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	 aesdec		$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	 aesdec		$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	 aesdec		$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	 pxor	$twres,@tweak[0]		# calclulate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	 pxor	$twres,@tweak[1]
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	 pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	 pxor	$twres,@tweak[3]
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	 pxor	$twres,@tweak[4]
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	 pxor	$twres,$twmask
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	 movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6

.align	32
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	# single-block decrypt of the lone remaining full block
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# ciphertext stealing: decrypt the second-to-last (withheld) block
	# under the next tweak before swapping tail bytes
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# decrypt the reassembled final block with the original tweak[0]
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0		# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Scrub remaining xmm registers and the stack scratch area so no key or
# tweak material leaks; on Win64 the non-volatile xmm6-15 are restored
# (and their save slots cleared) instead of zeroed.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
# restore %rbp/%rsp from the %r11 frame pointer and return
$code.=<<___;
	mov	-8(%r11),%rbp
	lea	(%r11),%rsp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
{
my @offset=map("%xmm$_",(10..15));
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
my $seventh_arg = $win64 ? 56 : 8;
my $blocks = $len;

$code.=<<___;
.globl	aesni_ocb_encrypt
.type	aesni_ocb_encrypt,\@function,6
.align	32
aesni_ocb_encrypt:
	lea	(%rsp),%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2791 mov \$16+32,$rounds 2792 lea 32($key_,$rnds_),$key 2793 $movkey 16($key_),$rndkey1 # round[1] 2794 sub %r10,%rax # twisted $rounds 2795 mov %rax,%r10 # backup twisted $rounds 2796 2797 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 2798 movdqu ($checksum_p),$checksum # load checksum 2799 2800 test \$1,$block_num # is first block number odd? 2801 jnz .Locb_enc_odd 2802 2803 bsf $block_num,$i1 2804 add \$1,$block_num 2805 shl \$4,$i1 2806 movdqu ($L_p,$i1),$inout5 # borrow 2807 movdqu ($inp),$inout0 2808 lea 16($inp),$inp 2809 2810 call __ocb_encrypt1 2811 2812 movdqa $inout5,@offset[5] 2813 movups $inout0,($out) 2814 lea 16($out),$out 2815 sub \$1,$blocks 2816 jz .Locb_enc_done 2817 2818 .Locb_enc_odd: 2819 lea 1($block_num),$i1 # even-numbered blocks 2820 lea 3($block_num),$i3 2821 lea 5($block_num),$i5 2822 lea 6($block_num),$block_num 2823 bsf $i1,$i1 # ntz(block) 2824 bsf $i3,$i3 2825 bsf $i5,$i5 2826 shl \$4,$i1 # ntz(block) -> table offset 2827 shl \$4,$i3 2828 shl \$4,$i5 2829 2830 sub \$6,$blocks 2831 jc .Locb_enc_short 2832 jmp .Locb_enc_grandloop 2833 2834 .align 32 2835 .Locb_enc_grandloop: 2836 movdqu `16*0`($inp),$inout0 # load input 2837 movdqu `16*1`($inp),$inout1 2838 movdqu `16*2`($inp),$inout2 2839 movdqu `16*3`($inp),$inout3 2840 movdqu `16*4`($inp),$inout4 2841 movdqu `16*5`($inp),$inout5 2842 lea `16*6`($inp),$inp 2843 2844 call __ocb_encrypt6 2845 2846 movups $inout0,`16*0`($out) # store output 2847 movups $inout1,`16*1`($out) 2848 movups $inout2,`16*2`($out) 2849 movups $inout3,`16*3`($out) 2850 movups $inout4,`16*4`($out) 2851 movups $inout5,`16*5`($out) 2852 lea `16*6`($out),$out 2853 sub \$6,$blocks 2854 jnc .Locb_enc_grandloop 2855 2856 .Locb_enc_short: 2857 add \$6,$blocks 2858 jz .Locb_enc_done 2859 2860 movdqu `16*0`($inp),$inout0 2861 cmp \$2,$blocks 2862 jb .Locb_enc_one 2863 movdqu `16*1`($inp),$inout1 2864 je .Locb_enc_two 2865 2866 movdqu `16*2`($inp),$inout2 2867 cmp \$4,$blocks 2868 jb .Locb_enc_three 2869 movdqu 
`16*3`($inp),$inout3 2870 je .Locb_enc_four 2871 2872 movdqu `16*4`($inp),$inout4 2873 pxor $inout5,$inout5 2874 2875 call __ocb_encrypt6 2876 2877 movdqa @offset[4],@offset[5] 2878 movups $inout0,`16*0`($out) 2879 movups $inout1,`16*1`($out) 2880 movups $inout2,`16*2`($out) 2881 movups $inout3,`16*3`($out) 2882 movups $inout4,`16*4`($out) 2883 2884 jmp .Locb_enc_done 2885 2886 .align 16 2887 .Locb_enc_one: 2888 movdqa @offset[0],$inout5 # borrow 2889 2890 call __ocb_encrypt1 2891 2892 movdqa $inout5,@offset[5] 2893 movups $inout0,`16*0`($out) 2894 jmp .Locb_enc_done 2895 2896 .align 16 2897 .Locb_enc_two: 2898 pxor $inout2,$inout2 2899 pxor $inout3,$inout3 2900 2901 call __ocb_encrypt4 2902 2903 movdqa @offset[1],@offset[5] 2904 movups $inout0,`16*0`($out) 2905 movups $inout1,`16*1`($out) 2906 2907 jmp .Locb_enc_done 2908 2909 .align 16 2910 .Locb_enc_three: 2911 pxor $inout3,$inout3 2912 2913 call __ocb_encrypt4 2914 2915 movdqa @offset[2],@offset[5] 2916 movups $inout0,`16*0`($out) 2917 movups $inout1,`16*1`($out) 2918 movups $inout2,`16*2`($out) 2919 2920 jmp .Locb_enc_done 2921 2922 .align 16 2923 .Locb_enc_four: 2924 call __ocb_encrypt4 2925 2926 movdqa @offset[3],@offset[5] 2927 movups $inout0,`16*0`($out) 2928 movups $inout1,`16*1`($out) 2929 movups $inout2,`16*2`($out) 2930 movups $inout3,`16*3`($out) 2931 2932 .Locb_enc_done: 2933 pxor $rndkey0,@offset[5] # "remove" round[last] 2934 movdqu $checksum,($checksum_p) # store checksum 2935 movdqu @offset[5],($offset_p) # store last offset_i 2936 2937 xorps %xmm0,%xmm0 # clear register bank 2938 pxor %xmm1,%xmm1 2939 pxor %xmm2,%xmm2 2940 pxor %xmm3,%xmm3 2941 pxor %xmm4,%xmm4 2942 pxor %xmm5,%xmm5 2943 ___ 2944 $code.=<<___ if (!$win64); 2945 pxor %xmm6,%xmm6 2946 pxor %xmm7,%xmm7 2947 pxor %xmm8,%xmm8 2948 pxor %xmm9,%xmm9 2949 pxor %xmm10,%xmm10 2950 pxor %xmm11,%xmm11 2951 pxor %xmm12,%xmm12 2952 pxor %xmm13,%xmm13 2953 pxor %xmm14,%xmm14 2954 pxor %xmm15,%xmm15 2955 lea 0x28(%rsp),%rax 2956 ___ 2957 
$code.=<<___ if ($win64); 2958 movaps 0x00(%rsp),%xmm6 2959 movaps %xmm0,0x00(%rsp) # clear stack 2960 movaps 0x10(%rsp),%xmm7 2961 movaps %xmm0,0x10(%rsp) 2962 movaps 0x20(%rsp),%xmm8 2963 movaps %xmm0,0x20(%rsp) 2964 movaps 0x30(%rsp),%xmm9 2965 movaps %xmm0,0x30(%rsp) 2966 movaps 0x40(%rsp),%xmm10 2967 movaps %xmm0,0x40(%rsp) 2968 movaps 0x50(%rsp),%xmm11 2969 movaps %xmm0,0x50(%rsp) 2970 movaps 0x60(%rsp),%xmm12 2971 movaps %xmm0,0x60(%rsp) 2972 movaps 0x70(%rsp),%xmm13 2973 movaps %xmm0,0x70(%rsp) 2974 movaps 0x80(%rsp),%xmm14 2975 movaps %xmm0,0x80(%rsp) 2976 movaps 0x90(%rsp),%xmm15 2977 movaps %xmm0,0x90(%rsp) 2978 lea 0xa0+0x28(%rsp),%rax 2979 .Locb_enc_pop: 2980 ___ 2981 $code.=<<___; 2982 mov -40(%rax),%r14 2983 mov -32(%rax),%r13 2984 mov -24(%rax),%r12 2985 mov -16(%rax),%rbp 2986 mov -8(%rax),%rbx 2987 lea (%rax),%rsp 2988 .Locb_enc_epilogue: 2989 ret 2990 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt 2991 2992 .type __ocb_encrypt6,\@abi-omnipotent 2993 .align 32 2994 __ocb_encrypt6: 2995 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 2996 movdqu ($L_p,$i1),@offset[1] 2997 movdqa @offset[0],@offset[2] 2998 movdqu ($L_p,$i3),@offset[3] 2999 movdqa @offset[0],@offset[4] 3000 pxor @offset[5],@offset[0] 3001 movdqu ($L_p,$i5),@offset[5] 3002 pxor @offset[0],@offset[1] 3003 pxor $inout0,$checksum # accumulate checksum 3004 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3005 pxor @offset[1],@offset[2] 3006 pxor $inout1,$checksum 3007 pxor @offset[1],$inout1 3008 pxor @offset[2],@offset[3] 3009 pxor $inout2,$checksum 3010 pxor @offset[2],$inout2 3011 pxor @offset[3],@offset[4] 3012 pxor $inout3,$checksum 3013 pxor @offset[3],$inout3 3014 pxor @offset[4],@offset[5] 3015 pxor $inout4,$checksum 3016 pxor @offset[4],$inout4 3017 pxor $inout5,$checksum 3018 pxor @offset[5],$inout5 3019 $movkey 32($key_),$rndkey0 3020 3021 lea 1($block_num),$i1 # even-numbered blocks 3022 lea 3($block_num),$i3 3023 lea 5($block_num),$i5 3024 add \$6,$block_num 3025 pxor 
$rndkey0l,@offset[0] # offset_i ^ round[last] 3026 bsf $i1,$i1 # ntz(block) 3027 bsf $i3,$i3 3028 bsf $i5,$i5 3029 3030 aesenc $rndkey1,$inout0 3031 aesenc $rndkey1,$inout1 3032 aesenc $rndkey1,$inout2 3033 aesenc $rndkey1,$inout3 3034 pxor $rndkey0l,@offset[1] 3035 pxor $rndkey0l,@offset[2] 3036 aesenc $rndkey1,$inout4 3037 pxor $rndkey0l,@offset[3] 3038 pxor $rndkey0l,@offset[4] 3039 aesenc $rndkey1,$inout5 3040 $movkey 48($key_),$rndkey1 3041 pxor $rndkey0l,@offset[5] 3042 3043 aesenc $rndkey0,$inout0 3044 aesenc $rndkey0,$inout1 3045 aesenc $rndkey0,$inout2 3046 aesenc $rndkey0,$inout3 3047 aesenc $rndkey0,$inout4 3048 aesenc $rndkey0,$inout5 3049 $movkey 64($key_),$rndkey0 3050 shl \$4,$i1 # ntz(block) -> table offset 3051 shl \$4,$i3 3052 jmp .Locb_enc_loop6 3053 3054 .align 32 3055 .Locb_enc_loop6: 3056 aesenc $rndkey1,$inout0 3057 aesenc $rndkey1,$inout1 3058 aesenc $rndkey1,$inout2 3059 aesenc $rndkey1,$inout3 3060 aesenc $rndkey1,$inout4 3061 aesenc $rndkey1,$inout5 3062 $movkey ($key,%rax),$rndkey1 3063 add \$32,%rax 3064 3065 aesenc $rndkey0,$inout0 3066 aesenc $rndkey0,$inout1 3067 aesenc $rndkey0,$inout2 3068 aesenc $rndkey0,$inout3 3069 aesenc $rndkey0,$inout4 3070 aesenc $rndkey0,$inout5 3071 $movkey -16($key,%rax),$rndkey0 3072 jnz .Locb_enc_loop6 3073 3074 aesenc $rndkey1,$inout0 3075 aesenc $rndkey1,$inout1 3076 aesenc $rndkey1,$inout2 3077 aesenc $rndkey1,$inout3 3078 aesenc $rndkey1,$inout4 3079 aesenc $rndkey1,$inout5 3080 $movkey 16($key_),$rndkey1 3081 shl \$4,$i5 3082 3083 aesenclast @offset[0],$inout0 3084 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3085 mov %r10,%rax # restore twisted rounds 3086 aesenclast @offset[1],$inout1 3087 aesenclast @offset[2],$inout2 3088 aesenclast @offset[3],$inout3 3089 aesenclast @offset[4],$inout4 3090 aesenclast @offset[5],$inout5 3091 ret 3092 .size __ocb_encrypt6,.-__ocb_encrypt6 3093 3094 .type __ocb_encrypt4,\@abi-omnipotent 3095 .align 32 3096 __ocb_encrypt4: 3097 pxor 
$rndkey0l,@offset[5] # offset_i ^ round[0] 3098 movdqu ($L_p,$i1),@offset[1] 3099 movdqa @offset[0],@offset[2] 3100 movdqu ($L_p,$i3),@offset[3] 3101 pxor @offset[5],@offset[0] 3102 pxor @offset[0],@offset[1] 3103 pxor $inout0,$checksum # accumulate checksum 3104 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3105 pxor @offset[1],@offset[2] 3106 pxor $inout1,$checksum 3107 pxor @offset[1],$inout1 3108 pxor @offset[2],@offset[3] 3109 pxor $inout2,$checksum 3110 pxor @offset[2],$inout2 3111 pxor $inout3,$checksum 3112 pxor @offset[3],$inout3 3113 $movkey 32($key_),$rndkey0 3114 3115 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3116 pxor $rndkey0l,@offset[1] 3117 pxor $rndkey0l,@offset[2] 3118 pxor $rndkey0l,@offset[3] 3119 3120 aesenc $rndkey1,$inout0 3121 aesenc $rndkey1,$inout1 3122 aesenc $rndkey1,$inout2 3123 aesenc $rndkey1,$inout3 3124 $movkey 48($key_),$rndkey1 3125 3126 aesenc $rndkey0,$inout0 3127 aesenc $rndkey0,$inout1 3128 aesenc $rndkey0,$inout2 3129 aesenc $rndkey0,$inout3 3130 $movkey 64($key_),$rndkey0 3131 jmp .Locb_enc_loop4 3132 3133 .align 32 3134 .Locb_enc_loop4: 3135 aesenc $rndkey1,$inout0 3136 aesenc $rndkey1,$inout1 3137 aesenc $rndkey1,$inout2 3138 aesenc $rndkey1,$inout3 3139 $movkey ($key,%rax),$rndkey1 3140 add \$32,%rax 3141 3142 aesenc $rndkey0,$inout0 3143 aesenc $rndkey0,$inout1 3144 aesenc $rndkey0,$inout2 3145 aesenc $rndkey0,$inout3 3146 $movkey -16($key,%rax),$rndkey0 3147 jnz .Locb_enc_loop4 3148 3149 aesenc $rndkey1,$inout0 3150 aesenc $rndkey1,$inout1 3151 aesenc $rndkey1,$inout2 3152 aesenc $rndkey1,$inout3 3153 $movkey 16($key_),$rndkey1 3154 mov %r10,%rax # restore twisted rounds 3155 3156 aesenclast @offset[0],$inout0 3157 aesenclast @offset[1],$inout1 3158 aesenclast @offset[2],$inout2 3159 aesenclast @offset[3],$inout3 3160 ret 3161 .size __ocb_encrypt4,.-__ocb_encrypt4 3162 3163 .type __ocb_encrypt1,\@abi-omnipotent 3164 .align 32 3165 __ocb_encrypt1: 3166 pxor @offset[5],$inout5 # offset_i 3167 pxor 
$rndkey0l,$inout5 # offset_i ^ round[0] 3168 pxor $inout0,$checksum # accumulate checksum 3169 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3170 $movkey 32($key_),$rndkey0 3171 3172 aesenc $rndkey1,$inout0 3173 $movkey 48($key_),$rndkey1 3174 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3175 3176 aesenc $rndkey0,$inout0 3177 $movkey 64($key_),$rndkey0 3178 jmp .Locb_enc_loop1 3179 3180 .align 32 3181 .Locb_enc_loop1: 3182 aesenc $rndkey1,$inout0 3183 $movkey ($key,%rax),$rndkey1 3184 add \$32,%rax 3185 3186 aesenc $rndkey0,$inout0 3187 $movkey -16($key,%rax),$rndkey0 3188 jnz .Locb_enc_loop1 3189 3190 aesenc $rndkey1,$inout0 3191 $movkey 16($key_),$rndkey1 # redundant in tail 3192 mov %r10,%rax # restore twisted rounds 3193 3194 aesenclast $inout5,$inout0 3195 ret 3196 .size __ocb_encrypt1,.-__ocb_encrypt1 3197 3198 .globl aesni_ocb_decrypt 3199 .type aesni_ocb_decrypt,\@function,6 3200 .align 32 3201 aesni_ocb_decrypt: 3202 lea (%rsp),%rax 3203 push %rbx 3204 push %rbp 3205 push %r12 3206 push %r13 3207 push %r14 3208 ___ 3209 $code.=<<___ if ($win64); 3210 lea -0xa0(%rsp),%rsp 3211 movaps %xmm6,0x00(%rsp) # offload everything 3212 movaps %xmm7,0x10(%rsp) 3213 movaps %xmm8,0x20(%rsp) 3214 movaps %xmm9,0x30(%rsp) 3215 movaps %xmm10,0x40(%rsp) 3216 movaps %xmm11,0x50(%rsp) 3217 movaps %xmm12,0x60(%rsp) 3218 movaps %xmm13,0x70(%rsp) 3219 movaps %xmm14,0x80(%rsp) 3220 movaps %xmm15,0x90(%rsp) 3221 .Locb_dec_body: 3222 ___ 3223 $code.=<<___; 3224 mov $seventh_arg(%rax),$L_p # 7th argument 3225 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 3226 3227 mov 240($key),$rnds_ 3228 mov $key,$key_ 3229 shl \$4,$rnds_ 3230 $movkey ($key),$rndkey0l # round[0] 3231 $movkey 16($key,$rnds_),$rndkey1 # round[last] 3232 3233 movdqu ($offset_p),@offset[5] # load last offset_i 3234 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 3235 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 3236 3237 mov \$16+32,$rounds 3238 lea 32($key_,$rnds_),$key 3239 $movkey 
16($key_),$rndkey1 # round[1] 3240 sub %r10,%rax # twisted $rounds 3241 mov %rax,%r10 # backup twisted $rounds 3242 3243 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3244 movdqu ($checksum_p),$checksum # load checksum 3245 3246 test \$1,$block_num # is first block number odd? 3247 jnz .Locb_dec_odd 3248 3249 bsf $block_num,$i1 3250 add \$1,$block_num 3251 shl \$4,$i1 3252 movdqu ($L_p,$i1),$inout5 # borrow 3253 movdqu ($inp),$inout0 3254 lea 16($inp),$inp 3255 3256 call __ocb_decrypt1 3257 3258 movdqa $inout5,@offset[5] 3259 movups $inout0,($out) 3260 xorps $inout0,$checksum # accumulate checksum 3261 lea 16($out),$out 3262 sub \$1,$blocks 3263 jz .Locb_dec_done 3264 3265 .Locb_dec_odd: 3266 lea 1($block_num),$i1 # even-numbered blocks 3267 lea 3($block_num),$i3 3268 lea 5($block_num),$i5 3269 lea 6($block_num),$block_num 3270 bsf $i1,$i1 # ntz(block) 3271 bsf $i3,$i3 3272 bsf $i5,$i5 3273 shl \$4,$i1 # ntz(block) -> table offset 3274 shl \$4,$i3 3275 shl \$4,$i5 3276 3277 sub \$6,$blocks 3278 jc .Locb_dec_short 3279 jmp .Locb_dec_grandloop 3280 3281 .align 32 3282 .Locb_dec_grandloop: 3283 movdqu `16*0`($inp),$inout0 # load input 3284 movdqu `16*1`($inp),$inout1 3285 movdqu `16*2`($inp),$inout2 3286 movdqu `16*3`($inp),$inout3 3287 movdqu `16*4`($inp),$inout4 3288 movdqu `16*5`($inp),$inout5 3289 lea `16*6`($inp),$inp 3290 3291 call __ocb_decrypt6 3292 3293 movups $inout0,`16*0`($out) # store output 3294 pxor $inout0,$checksum # accumulate checksum 3295 movups $inout1,`16*1`($out) 3296 pxor $inout1,$checksum 3297 movups $inout2,`16*2`($out) 3298 pxor $inout2,$checksum 3299 movups $inout3,`16*3`($out) 3300 pxor $inout3,$checksum 3301 movups $inout4,`16*4`($out) 3302 pxor $inout4,$checksum 3303 movups $inout5,`16*5`($out) 3304 pxor $inout5,$checksum 3305 lea `16*6`($out),$out 3306 sub \$6,$blocks 3307 jnc .Locb_dec_grandloop 3308 3309 .Locb_dec_short: 3310 add \$6,$blocks 3311 jz .Locb_dec_done 3312 3313 movdqu `16*0`($inp),$inout0 3314 cmp \$2,$blocks 
3315 jb .Locb_dec_one 3316 movdqu `16*1`($inp),$inout1 3317 je .Locb_dec_two 3318 3319 movdqu `16*2`($inp),$inout2 3320 cmp \$4,$blocks 3321 jb .Locb_dec_three 3322 movdqu `16*3`($inp),$inout3 3323 je .Locb_dec_four 3324 3325 movdqu `16*4`($inp),$inout4 3326 pxor $inout5,$inout5 3327 3328 call __ocb_decrypt6 3329 3330 movdqa @offset[4],@offset[5] 3331 movups $inout0,`16*0`($out) # store output 3332 pxor $inout0,$checksum # accumulate checksum 3333 movups $inout1,`16*1`($out) 3334 pxor $inout1,$checksum 3335 movups $inout2,`16*2`($out) 3336 pxor $inout2,$checksum 3337 movups $inout3,`16*3`($out) 3338 pxor $inout3,$checksum 3339 movups $inout4,`16*4`($out) 3340 pxor $inout4,$checksum 3341 3342 jmp .Locb_dec_done 3343 3344 .align 16 3345 .Locb_dec_one: 3346 movdqa @offset[0],$inout5 # borrow 3347 3348 call __ocb_decrypt1 3349 3350 movdqa $inout5,@offset[5] 3351 movups $inout0,`16*0`($out) # store output 3352 xorps $inout0,$checksum # accumulate checksum 3353 jmp .Locb_dec_done 3354 3355 .align 16 3356 .Locb_dec_two: 3357 pxor $inout2,$inout2 3358 pxor $inout3,$inout3 3359 3360 call __ocb_decrypt4 3361 3362 movdqa @offset[1],@offset[5] 3363 movups $inout0,`16*0`($out) # store output 3364 xorps $inout0,$checksum # accumulate checksum 3365 movups $inout1,`16*1`($out) 3366 xorps $inout1,$checksum 3367 3368 jmp .Locb_dec_done 3369 3370 .align 16 3371 .Locb_dec_three: 3372 pxor $inout3,$inout3 3373 3374 call __ocb_decrypt4 3375 3376 movdqa @offset[2],@offset[5] 3377 movups $inout0,`16*0`($out) # store output 3378 xorps $inout0,$checksum # accumulate checksum 3379 movups $inout1,`16*1`($out) 3380 xorps $inout1,$checksum 3381 movups $inout2,`16*2`($out) 3382 xorps $inout2,$checksum 3383 3384 jmp .Locb_dec_done 3385 3386 .align 16 3387 .Locb_dec_four: 3388 call __ocb_decrypt4 3389 3390 movdqa @offset[3],@offset[5] 3391 movups $inout0,`16*0`($out) # store output 3392 pxor $inout0,$checksum # accumulate checksum 3393 movups $inout1,`16*1`($out) 3394 pxor $inout1,$checksum 3395 
movups $inout2,`16*2`($out) 3396 pxor $inout2,$checksum 3397 movups $inout3,`16*3`($out) 3398 pxor $inout3,$checksum 3399 3400 .Locb_dec_done: 3401 pxor $rndkey0,@offset[5] # "remove" round[last] 3402 movdqu $checksum,($checksum_p) # store checksum 3403 movdqu @offset[5],($offset_p) # store last offset_i 3404 3405 xorps %xmm0,%xmm0 # clear register bank 3406 pxor %xmm1,%xmm1 3407 pxor %xmm2,%xmm2 3408 pxor %xmm3,%xmm3 3409 pxor %xmm4,%xmm4 3410 pxor %xmm5,%xmm5 3411 ___ 3412 $code.=<<___ if (!$win64); 3413 pxor %xmm6,%xmm6 3414 pxor %xmm7,%xmm7 3415 pxor %xmm8,%xmm8 3416 pxor %xmm9,%xmm9 3417 pxor %xmm10,%xmm10 3418 pxor %xmm11,%xmm11 3419 pxor %xmm12,%xmm12 3420 pxor %xmm13,%xmm13 3421 pxor %xmm14,%xmm14 3422 pxor %xmm15,%xmm15 3423 lea 0x28(%rsp),%rax 3424 ___ 3425 $code.=<<___ if ($win64); 3426 movaps 0x00(%rsp),%xmm6 3427 movaps %xmm0,0x00(%rsp) # clear stack 3428 movaps 0x10(%rsp),%xmm7 3429 movaps %xmm0,0x10(%rsp) 3430 movaps 0x20(%rsp),%xmm8 3431 movaps %xmm0,0x20(%rsp) 3432 movaps 0x30(%rsp),%xmm9 3433 movaps %xmm0,0x30(%rsp) 3434 movaps 0x40(%rsp),%xmm10 3435 movaps %xmm0,0x40(%rsp) 3436 movaps 0x50(%rsp),%xmm11 3437 movaps %xmm0,0x50(%rsp) 3438 movaps 0x60(%rsp),%xmm12 3439 movaps %xmm0,0x60(%rsp) 3440 movaps 0x70(%rsp),%xmm13 3441 movaps %xmm0,0x70(%rsp) 3442 movaps 0x80(%rsp),%xmm14 3443 movaps %xmm0,0x80(%rsp) 3444 movaps 0x90(%rsp),%xmm15 3445 movaps %xmm0,0x90(%rsp) 3446 lea 0xa0+0x28(%rsp),%rax 3447 .Locb_dec_pop: 3448 ___ 3449 $code.=<<___; 3450 mov -40(%rax),%r14 3451 mov -32(%rax),%r13 3452 mov -24(%rax),%r12 3453 mov -16(%rax),%rbp 3454 mov -8(%rax),%rbx 3455 lea (%rax),%rsp 3456 .Locb_dec_epilogue: 3457 ret 3458 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt 3459 3460 .type __ocb_decrypt6,\@abi-omnipotent 3461 .align 32 3462 __ocb_decrypt6: 3463 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3464 movdqu ($L_p,$i1),@offset[1] 3465 movdqa @offset[0],@offset[2] 3466 movdqu ($L_p,$i3),@offset[3] 3467 movdqa @offset[0],@offset[4] 3468 pxor 
@offset[5],@offset[0] 3469 movdqu ($L_p,$i5),@offset[5] 3470 pxor @offset[0],@offset[1] 3471 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3472 pxor @offset[1],@offset[2] 3473 pxor @offset[1],$inout1 3474 pxor @offset[2],@offset[3] 3475 pxor @offset[2],$inout2 3476 pxor @offset[3],@offset[4] 3477 pxor @offset[3],$inout3 3478 pxor @offset[4],@offset[5] 3479 pxor @offset[4],$inout4 3480 pxor @offset[5],$inout5 3481 $movkey 32($key_),$rndkey0 3482 3483 lea 1($block_num),$i1 # even-numbered blocks 3484 lea 3($block_num),$i3 3485 lea 5($block_num),$i5 3486 add \$6,$block_num 3487 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3488 bsf $i1,$i1 # ntz(block) 3489 bsf $i3,$i3 3490 bsf $i5,$i5 3491 3492 aesdec $rndkey1,$inout0 3493 aesdec $rndkey1,$inout1 3494 aesdec $rndkey1,$inout2 3495 aesdec $rndkey1,$inout3 3496 pxor $rndkey0l,@offset[1] 3497 pxor $rndkey0l,@offset[2] 3498 aesdec $rndkey1,$inout4 3499 pxor $rndkey0l,@offset[3] 3500 pxor $rndkey0l,@offset[4] 3501 aesdec $rndkey1,$inout5 3502 $movkey 48($key_),$rndkey1 3503 pxor $rndkey0l,@offset[5] 3504 3505 aesdec $rndkey0,$inout0 3506 aesdec $rndkey0,$inout1 3507 aesdec $rndkey0,$inout2 3508 aesdec $rndkey0,$inout3 3509 aesdec $rndkey0,$inout4 3510 aesdec $rndkey0,$inout5 3511 $movkey 64($key_),$rndkey0 3512 shl \$4,$i1 # ntz(block) -> table offset 3513 shl \$4,$i3 3514 jmp .Locb_dec_loop6 3515 3516 .align 32 3517 .Locb_dec_loop6: 3518 aesdec $rndkey1,$inout0 3519 aesdec $rndkey1,$inout1 3520 aesdec $rndkey1,$inout2 3521 aesdec $rndkey1,$inout3 3522 aesdec $rndkey1,$inout4 3523 aesdec $rndkey1,$inout5 3524 $movkey ($key,%rax),$rndkey1 3525 add \$32,%rax 3526 3527 aesdec $rndkey0,$inout0 3528 aesdec $rndkey0,$inout1 3529 aesdec $rndkey0,$inout2 3530 aesdec $rndkey0,$inout3 3531 aesdec $rndkey0,$inout4 3532 aesdec $rndkey0,$inout5 3533 $movkey -16($key,%rax),$rndkey0 3534 jnz .Locb_dec_loop6 3535 3536 aesdec $rndkey1,$inout0 3537 aesdec $rndkey1,$inout1 3538 aesdec $rndkey1,$inout2 3539 aesdec 
$rndkey1,$inout3 3540 aesdec $rndkey1,$inout4 3541 aesdec $rndkey1,$inout5 3542 $movkey 16($key_),$rndkey1 3543 shl \$4,$i5 3544 3545 aesdeclast @offset[0],$inout0 3546 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3547 mov %r10,%rax # restore twisted rounds 3548 aesdeclast @offset[1],$inout1 3549 aesdeclast @offset[2],$inout2 3550 aesdeclast @offset[3],$inout3 3551 aesdeclast @offset[4],$inout4 3552 aesdeclast @offset[5],$inout5 3553 ret 3554 .size __ocb_decrypt6,.-__ocb_decrypt6 3555 3556 .type __ocb_decrypt4,\@abi-omnipotent 3557 .align 32 3558 __ocb_decrypt4: 3559 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3560 movdqu ($L_p,$i1),@offset[1] 3561 movdqa @offset[0],@offset[2] 3562 movdqu ($L_p,$i3),@offset[3] 3563 pxor @offset[5],@offset[0] 3564 pxor @offset[0],@offset[1] 3565 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3566 pxor @offset[1],@offset[2] 3567 pxor @offset[1],$inout1 3568 pxor @offset[2],@offset[3] 3569 pxor @offset[2],$inout2 3570 pxor @offset[3],$inout3 3571 $movkey 32($key_),$rndkey0 3572 3573 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3574 pxor $rndkey0l,@offset[1] 3575 pxor $rndkey0l,@offset[2] 3576 pxor $rndkey0l,@offset[3] 3577 3578 aesdec $rndkey1,$inout0 3579 aesdec $rndkey1,$inout1 3580 aesdec $rndkey1,$inout2 3581 aesdec $rndkey1,$inout3 3582 $movkey 48($key_),$rndkey1 3583 3584 aesdec $rndkey0,$inout0 3585 aesdec $rndkey0,$inout1 3586 aesdec $rndkey0,$inout2 3587 aesdec $rndkey0,$inout3 3588 $movkey 64($key_),$rndkey0 3589 jmp .Locb_dec_loop4 3590 3591 .align 32 3592 .Locb_dec_loop4: 3593 aesdec $rndkey1,$inout0 3594 aesdec $rndkey1,$inout1 3595 aesdec $rndkey1,$inout2 3596 aesdec $rndkey1,$inout3 3597 $movkey ($key,%rax),$rndkey1 3598 add \$32,%rax 3599 3600 aesdec $rndkey0,$inout0 3601 aesdec $rndkey0,$inout1 3602 aesdec $rndkey0,$inout2 3603 aesdec $rndkey0,$inout3 3604 $movkey -16($key,%rax),$rndkey0 3605 jnz .Locb_dec_loop4 3606 3607 aesdec $rndkey1,$inout0 3608 aesdec $rndkey1,$inout1 3609 aesdec 
	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax		# restore twisted rounds

	aesdeclast	@offset[0],$inout0	# last round also xors offset_i ^ round[last]
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	ret
.size	__ocb_decrypt4,.-__ocb_decrypt4

# Single-block OCB decrypt helper. On entry $inout0 holds the input
# block, $inout5 the caller-selected L-table entry, $rndkey1 round[1]
# and %rax the "twisted" round count (backed up in %r10). Whitening
# with offset_i is merged into the round[0]/round[last] xors, so no
# extra passes over the data are needed.
.type	__ocb_decrypt1,\@abi-omnipotent
.align	32
__ocb_decrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesdec	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesdec	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_dec_loop1

.align	32
.Locb_dec_loop1:
	aesdec	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax		# two round keys consumed per spin

	aesdec	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop1

	aesdec	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1	# redundant in tail
	mov	%r10,%rax		# restore twisted rounds

	aesdeclast	$inout5,$inout0	# ^= offset_i ^ round[last]
	ret
.size	__ocb_decrypt1,.-__ocb_decrypt1
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
{
my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));

$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT
------------------------------# 3679 movups ($ivp),$inout0 # load iv as initial state 3680 mov $rnds_,$rounds 3681 cmp \$16,$len 3682 jb .Lcbc_enc_tail 3683 sub \$16,$len 3684 jmp .Lcbc_enc_loop 3685 .align 16 3686 .Lcbc_enc_loop: 3687 movups ($inp),$inout1 # load input 3688 lea 16($inp),$inp 3689 #xorps $inout1,$inout0 3690 ___ 3691 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3692 $code.=<<___; 3693 mov $rnds_,$rounds # restore $rounds 3694 mov $key_,$key # restore $key 3695 movups $inout0,0($out) # store output 3696 lea 16($out),$out 3697 sub \$16,$len 3698 jnc .Lcbc_enc_loop 3699 add \$16,$len 3700 jnz .Lcbc_enc_tail 3701 pxor $rndkey0,$rndkey0 # clear register bank 3702 pxor $rndkey1,$rndkey1 3703 movups $inout0,($ivp) 3704 pxor $inout0,$inout0 3705 pxor $inout1,$inout1 3706 jmp .Lcbc_ret 3707 3708 .Lcbc_enc_tail: 3709 mov $len,%rcx # zaps $key 3710 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3711 .long 0x9066A4F3 # rep movsb 3712 mov \$16,%ecx # zero tail 3713 sub $len,%rcx 3714 xor %eax,%eax 3715 .long 0x9066AAF3 # rep stosb 3716 lea -16(%rdi),%rdi # rewind $out by 1 block 3717 mov $rnds_,$rounds # restore $rounds 3718 mov %rdi,%rsi # $inp and $out are the same 3719 mov $key_,$key # restore $key 3720 xor $len,$len # len=16 3721 jmp .Lcbc_enc_loop # one more spin 3722 #--------------------------- CBC DECRYPT ------------------------------# 3724 .align 16 3725 .Lcbc_decrypt: 3726 cmp \$16,$len 3727 jne .Lcbc_decrypt_bulk 3728 3729 # handle single block without allocating stack frame, 3730 # useful in ciphertext stealing mode 3731 movdqu ($inp),$inout0 # load input 3732 movdqu ($ivp),$inout1 # load iv 3733 movdqa $inout0,$inout2 # future iv 3734 ___ 3735 &aesni_generate1("dec",$key,$rnds_); 3736 $code.=<<___; 3737 pxor $rndkey0,$rndkey0 # clear register bank 3738 pxor $rndkey1,$rndkey1 3739 movdqu $inout2,($ivp) # store iv 3740 xorps $inout1,$inout0 # ^=iv 3741 pxor $inout1,$inout1 3742 movups $inout0,($out) # store output 3743 pxor 
$inout0,$inout0 3744 jmp .Lcbc_ret 3745 .align 16 3746 .Lcbc_decrypt_bulk: 3747 lea (%rsp),%r11 # frame pointer 3748 push %rbp 3749 sub \$$frame_size,%rsp 3750 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3751 ___ 3752 $code.=<<___ if ($win64); 3753 movaps %xmm6,0x10(%rsp) 3754 movaps %xmm7,0x20(%rsp) 3755 movaps %xmm8,0x30(%rsp) 3756 movaps %xmm9,0x40(%rsp) 3757 movaps %xmm10,0x50(%rsp) 3758 movaps %xmm11,0x60(%rsp) 3759 movaps %xmm12,0x70(%rsp) 3760 movaps %xmm13,0x80(%rsp) 3761 movaps %xmm14,0x90(%rsp) 3762 movaps %xmm15,0xa0(%rsp) 3763 .Lcbc_decrypt_body: 3764 ___ 3765 3766 my $inp_=$key_="%rbp"; # reassign $key_ 3767 3768 $code.=<<___; 3769 mov $key,$key_ # [re-]backup $key [after reassignment] 3770 movups ($ivp),$iv 3771 mov $rnds_,$rounds 3772 cmp \$0x50,$len 3773 jbe .Lcbc_dec_tail 3774 3775 $movkey ($key),$rndkey0 3776 movdqu 0x00($inp),$inout0 # load input 3777 movdqu 0x10($inp),$inout1 3778 movdqa $inout0,$in0 3779 movdqu 0x20($inp),$inout2 3780 movdqa $inout1,$in1 3781 movdqu 0x30($inp),$inout3 3782 movdqa $inout2,$in2 3783 movdqu 0x40($inp),$inout4 3784 movdqa $inout3,$in3 3785 movdqu 0x50($inp),$inout5 3786 movdqa $inout4,$in4 3787 leaq OPENSSL_ia32cap_P(%rip),%r9 3788 mov 4(%r9),%r9d 3789 cmp \$0x70,$len 3790 jbe .Lcbc_dec_six_or_seven 3791 3792 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3793 sub \$0x50,$len # $len is biased by -5*16 3794 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3795 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3796 sub \$0x20,$len # $len is biased by -7*16 3797 lea 0x70($key),$key # size optimization 3798 jmp .Lcbc_dec_loop8_enter 3799 .align 16 3800 .Lcbc_dec_loop8: 3801 movups $inout7,($out) 3802 lea 0x10($out),$out 3803 .Lcbc_dec_loop8_enter: 3804 movdqu 0x60($inp),$inout6 3805 pxor $rndkey0,$inout0 3806 movdqu 0x70($inp),$inout7 3807 pxor $rndkey0,$inout1 3808 $movkey 0x10-0x70($key),$rndkey1 3809 pxor $rndkey0,$inout2 3810 mov \$-1,$inp_ 3811 cmp \$0x70,$len # is there at least 0x60 
bytes ahead? 3812 pxor $rndkey0,$inout3 3813 pxor $rndkey0,$inout4 3814 pxor $rndkey0,$inout5 3815 pxor $rndkey0,$inout6 3816 3817 aesdec $rndkey1,$inout0 3818 pxor $rndkey0,$inout7 3819 $movkey 0x20-0x70($key),$rndkey0 3820 aesdec $rndkey1,$inout1 3821 aesdec $rndkey1,$inout2 3822 aesdec $rndkey1,$inout3 3823 aesdec $rndkey1,$inout4 3824 aesdec $rndkey1,$inout5 3825 aesdec $rndkey1,$inout6 3826 adc \$0,$inp_ 3827 and \$128,$inp_ 3828 aesdec $rndkey1,$inout7 3829 add $inp,$inp_ 3830 $movkey 0x30-0x70($key),$rndkey1 3831 ___ 3832 for($i=1;$i<12;$i++) { 3833 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3834 $code.=<<___ if ($i==7); 3835 cmp \$11,$rounds 3836 ___ 3837 $code.=<<___; 3838 aesdec $rndkeyx,$inout0 3839 aesdec $rndkeyx,$inout1 3840 aesdec $rndkeyx,$inout2 3841 aesdec $rndkeyx,$inout3 3842 aesdec $rndkeyx,$inout4 3843 aesdec $rndkeyx,$inout5 3844 aesdec $rndkeyx,$inout6 3845 aesdec $rndkeyx,$inout7 3846 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3847 ___ 3848 $code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3849 nop 3850 ___ 3851 $code.=<<___ if ($i==7); 3852 jb .Lcbc_dec_done 3853 ___ 3854 $code.=<<___ if ($i==9); 3855 je .Lcbc_dec_done 3856 ___ 3857 $code.=<<___ if ($i==11); 3858 jmp .Lcbc_dec_done 3859 ___ 3860 } 3861 $code.=<<___; 3862 .align 16 3863 .Lcbc_dec_done: 3864 aesdec $rndkey1,$inout0 3865 aesdec $rndkey1,$inout1 3866 pxor $rndkey0,$iv 3867 pxor $rndkey0,$in0 3868 aesdec $rndkey1,$inout2 3869 aesdec $rndkey1,$inout3 3870 pxor $rndkey0,$in1 3871 pxor $rndkey0,$in2 3872 aesdec $rndkey1,$inout4 3873 aesdec $rndkey1,$inout5 3874 pxor $rndkey0,$in3 3875 pxor $rndkey0,$in4 3876 aesdec $rndkey1,$inout6 3877 aesdec $rndkey1,$inout7 3878 movdqu 0x50($inp),$rndkey1 3879 3880 aesdeclast $iv,$inout0 3881 movdqu 0x60($inp),$iv # borrow $iv 3882 pxor $rndkey0,$rndkey1 3883 aesdeclast $in0,$inout1 3884 pxor $rndkey0,$iv 3885 movdqu 0x70($inp),$rndkey0 # next IV 3886 aesdeclast $in1,$inout2 3887 lea 0x80($inp),$inp 3888 movdqu 0x00($inp_),$in0 3889 aesdeclast 
$in2,$inout3 3890 aesdeclast $in3,$inout4 3891 movdqu 0x10($inp_),$in1 3892 movdqu 0x20($inp_),$in2 3893 aesdeclast $in4,$inout5 3894 aesdeclast $rndkey1,$inout6 3895 movdqu 0x30($inp_),$in3 3896 movdqu 0x40($inp_),$in4 3897 aesdeclast $iv,$inout7 3898 movdqa $rndkey0,$iv # return $iv 3899 movdqu 0x50($inp_),$rndkey1 3900 $movkey -0x70($key),$rndkey0 3901 3902 movups $inout0,($out) # store output 3903 movdqa $in0,$inout0 3904 movups $inout1,0x10($out) 3905 movdqa $in1,$inout1 3906 movups $inout2,0x20($out) 3907 movdqa $in2,$inout2 3908 movups $inout3,0x30($out) 3909 movdqa $in3,$inout3 3910 movups $inout4,0x40($out) 3911 movdqa $in4,$inout4 3912 movups $inout5,0x50($out) 3913 movdqa $rndkey1,$inout5 3914 movups $inout6,0x60($out) 3915 lea 0x70($out),$out 3916 3917 sub \$0x80,$len 3918 ja .Lcbc_dec_loop8 3919 3920 movaps $inout7,$inout0 3921 lea -0x70($key),$key 3922 add \$0x70,$len 3923 jle .Lcbc_dec_clear_tail_collected 3924 movups $inout7,($out) 3925 lea 0x10($out),$out 3926 cmp \$0x50,$len 3927 jbe .Lcbc_dec_tail 3928 3929 movaps $in0,$inout0 3930 .Lcbc_dec_six_or_seven: 3931 cmp \$0x60,$len 3932 ja .Lcbc_dec_seven 3933 3934 movaps $inout5,$inout6 3935 call _aesni_decrypt6 3936 pxor $iv,$inout0 # ^= IV 3937 movaps $inout6,$iv 3938 pxor $in0,$inout1 3939 movdqu $inout0,($out) 3940 pxor $in1,$inout2 3941 movdqu $inout1,0x10($out) 3942 pxor $inout1,$inout1 # clear register bank 3943 pxor $in2,$inout3 3944 movdqu $inout2,0x20($out) 3945 pxor $inout2,$inout2 3946 pxor $in3,$inout4 3947 movdqu $inout3,0x30($out) 3948 pxor $inout3,$inout3 3949 pxor $in4,$inout5 3950 movdqu $inout4,0x40($out) 3951 pxor $inout4,$inout4 3952 lea 0x50($out),$out 3953 movdqa $inout5,$inout0 3954 pxor $inout5,$inout5 3955 jmp .Lcbc_dec_tail_collected 3956 3957 .align 16 3958 .Lcbc_dec_seven: 3959 movups 0x60($inp),$inout6 3960 xorps $inout7,$inout7 3961 call _aesni_decrypt8 3962 movups 0x50($inp),$inout7 3963 pxor $iv,$inout0 # ^= IV 3964 movups 0x60($inp),$iv 3965 pxor $in0,$inout1 3966 
movdqu $inout0,($out) 3967 pxor $in1,$inout2 3968 movdqu $inout1,0x10($out) 3969 pxor $inout1,$inout1 # clear register bank 3970 pxor $in2,$inout3 3971 movdqu $inout2,0x20($out) 3972 pxor $inout2,$inout2 3973 pxor $in3,$inout4 3974 movdqu $inout3,0x30($out) 3975 pxor $inout3,$inout3 3976 pxor $in4,$inout5 3977 movdqu $inout4,0x40($out) 3978 pxor $inout4,$inout4 3979 pxor $inout7,$inout6 3980 movdqu $inout5,0x50($out) 3981 pxor $inout5,$inout5 3982 lea 0x60($out),$out 3983 movdqa $inout6,$inout0 3984 pxor $inout6,$inout6 3985 pxor $inout7,$inout7 3986 jmp .Lcbc_dec_tail_collected 3987 3988 .align 16 3989 .Lcbc_dec_loop6: 3990 movups $inout5,($out) 3991 lea 0x10($out),$out 3992 movdqu 0x00($inp),$inout0 # load input 3993 movdqu 0x10($inp),$inout1 3994 movdqa $inout0,$in0 3995 movdqu 0x20($inp),$inout2 3996 movdqa $inout1,$in1 3997 movdqu 0x30($inp),$inout3 3998 movdqa $inout2,$in2 3999 movdqu 0x40($inp),$inout4 4000 movdqa $inout3,$in3 4001 movdqu 0x50($inp),$inout5 4002 movdqa $inout4,$in4 4003 .Lcbc_dec_loop6_enter: 4004 lea 0x60($inp),$inp 4005 movdqa $inout5,$inout6 4006 4007 call _aesni_decrypt6 4008 4009 pxor $iv,$inout0 # ^= IV 4010 movdqa $inout6,$iv 4011 pxor $in0,$inout1 4012 movdqu $inout0,($out) 4013 pxor $in1,$inout2 4014 movdqu $inout1,0x10($out) 4015 pxor $in2,$inout3 4016 movdqu $inout2,0x20($out) 4017 pxor $in3,$inout4 4018 mov $key_,$key 4019 movdqu $inout3,0x30($out) 4020 pxor $in4,$inout5 4021 mov $rnds_,$rounds 4022 movdqu $inout4,0x40($out) 4023 lea 0x50($out),$out 4024 sub \$0x60,$len 4025 ja .Lcbc_dec_loop6 4026 4027 movdqa $inout5,$inout0 4028 add \$0x50,$len 4029 jle .Lcbc_dec_clear_tail_collected 4030 movups $inout5,($out) 4031 lea 0x10($out),$out 4032 4033 .Lcbc_dec_tail: 4034 movups ($inp),$inout0 4035 sub \$0x10,$len 4036 jbe .Lcbc_dec_one # $len is 1*16 or less 4037 4038 movups 0x10($inp),$inout1 4039 movaps $inout0,$in0 4040 sub \$0x10,$len 4041 jbe .Lcbc_dec_two # $len is 2*16 or less 4042 4043 movups 0x20($inp),$inout2 4044 movaps 
$inout1,$in1 4045 sub \$0x10,$len 4046 jbe .Lcbc_dec_three # $len is 3*16 or less 4047 4048 movups 0x30($inp),$inout3 4049 movaps $inout2,$in2 4050 sub \$0x10,$len 4051 jbe .Lcbc_dec_four # $len is 4*16 or less 4052 4053 movups 0x40($inp),$inout4 # $len is 5*16 or less 4054 movaps $inout3,$in3 4055 movaps $inout4,$in4 4056 xorps $inout5,$inout5 4057 call _aesni_decrypt6 4058 pxor $iv,$inout0 4059 movaps $in4,$iv 4060 pxor $in0,$inout1 4061 movdqu $inout0,($out) 4062 pxor $in1,$inout2 4063 movdqu $inout1,0x10($out) 4064 pxor $inout1,$inout1 # clear register bank 4065 pxor $in2,$inout3 4066 movdqu $inout2,0x20($out) 4067 pxor $inout2,$inout2 4068 pxor $in3,$inout4 4069 movdqu $inout3,0x30($out) 4070 pxor $inout3,$inout3 4071 lea 0x40($out),$out 4072 movdqa $inout4,$inout0 4073 pxor $inout4,$inout4 4074 pxor $inout5,$inout5 4075 sub \$0x10,$len 4076 jmp .Lcbc_dec_tail_collected 4077 4078 .align 16 4079 .Lcbc_dec_one: 4080 movaps $inout0,$in0 4081 ___ 4082 &aesni_generate1("dec",$key,$rounds); 4083 $code.=<<___; 4084 xorps $iv,$inout0 4085 movaps $in0,$iv 4086 jmp .Lcbc_dec_tail_collected 4087 .align 16 4088 .Lcbc_dec_two: 4089 movaps $inout1,$in1 4090 call _aesni_decrypt2 4091 pxor $iv,$inout0 4092 movaps $in1,$iv 4093 pxor $in0,$inout1 4094 movdqu $inout0,($out) 4095 movdqa $inout1,$inout0 4096 pxor $inout1,$inout1 # clear register bank 4097 lea 0x10($out),$out 4098 jmp .Lcbc_dec_tail_collected 4099 .align 16 4100 .Lcbc_dec_three: 4101 movaps $inout2,$in2 4102 call _aesni_decrypt3 4103 pxor $iv,$inout0 4104 movaps $in2,$iv 4105 pxor $in0,$inout1 4106 movdqu $inout0,($out) 4107 pxor $in1,$inout2 4108 movdqu $inout1,0x10($out) 4109 pxor $inout1,$inout1 # clear register bank 4110 movdqa $inout2,$inout0 4111 pxor $inout2,$inout2 4112 lea 0x20($out),$out 4113 jmp .Lcbc_dec_tail_collected 4114 .align 16 4115 .Lcbc_dec_four: 4116 movaps $inout3,$in3 4117 call _aesni_decrypt4 4118 pxor $iv,$inout0 4119 movaps $in3,$iv 4120 pxor $in0,$inout1 4121 movdqu $inout0,($out) 4122 
pxor $in1,$inout2 4123 movdqu $inout1,0x10($out) 4124 pxor $inout1,$inout1 # clear register bank 4125 pxor $in2,$inout3 4126 movdqu $inout2,0x20($out) 4127 pxor $inout2,$inout2 4128 movdqa $inout3,$inout0 4129 pxor $inout3,$inout3 4130 lea 0x30($out),$out 4131 jmp .Lcbc_dec_tail_collected 4132 4133 .align 16 4134 .Lcbc_dec_clear_tail_collected: 4135 pxor $inout1,$inout1 # clear register bank 4136 pxor $inout2,$inout2 4137 pxor $inout3,$inout3 4138 ___ 4139 $code.=<<___ if (!$win64); 4140 pxor $inout4,$inout4 # %xmm6..9 4141 pxor $inout5,$inout5 4142 pxor $inout6,$inout6 4143 pxor $inout7,$inout7 4144 ___ 4145 $code.=<<___; 4146 .Lcbc_dec_tail_collected: 4147 movups $iv,($ivp) 4148 and \$15,$len 4149 jnz .Lcbc_dec_tail_partial 4150 movups $inout0,($out) 4151 pxor $inout0,$inout0 4152 jmp .Lcbc_dec_ret 4153 .align 16 4154 .Lcbc_dec_tail_partial: 4155 movaps $inout0,(%rsp) 4156 pxor $inout0,$inout0 4157 mov \$16,%rcx 4158 mov $out,%rdi 4159 sub $len,%rcx 4160 lea (%rsp),%rsi 4161 .long 0x9066A4F3 # rep movsb 4162 movdqa $inout0,(%rsp) 4163 4164 .Lcbc_dec_ret: 4165 xorps $rndkey0,$rndkey0 # %xmm0 4166 pxor $rndkey1,$rndkey1 4167 ___ 4168 $code.=<<___ if ($win64); 4169 movaps 0x10(%rsp),%xmm6 4170 movaps %xmm0,0x10(%rsp) # clear stack 4171 movaps 0x20(%rsp),%xmm7 4172 movaps %xmm0,0x20(%rsp) 4173 movaps 0x30(%rsp),%xmm8 4174 movaps %xmm0,0x30(%rsp) 4175 movaps 0x40(%rsp),%xmm9 4176 movaps %xmm0,0x40(%rsp) 4177 movaps 0x50(%rsp),%xmm10 4178 movaps %xmm0,0x50(%rsp) 4179 movaps 0x60(%rsp),%xmm11 4180 movaps %xmm0,0x60(%rsp) 4181 movaps 0x70(%rsp),%xmm12 4182 movaps %xmm0,0x70(%rsp) 4183 movaps 0x80(%rsp),%xmm13 4184 movaps %xmm0,0x80(%rsp) 4185 movaps 0x90(%rsp),%xmm14 4186 movaps %xmm0,0x90(%rsp) 4187 movaps 0xa0(%rsp),%xmm15 4188 movaps %xmm0,0xa0(%rsp) 4189 ___ 4190 $code.=<<___; 4191 mov -8(%r11),%rbp 4192 lea (%r11),%rsp 4193 .Lcbc_ret: 4194 ret 4195 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4196 ___ 4197 } 4199 # int ${PREFIX}_set_decrypt_key(const unsigned 
#							char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# Derives the decryption schedule from the encryption one: round keys
# are swapped end-for-end and aesimc (InvMixColumns) is applied to all
# but the outermost two, as the equivalent inverse cipher requires.
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;	# $bits is handled as a 32-bit quantity

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret		# propagate failure code
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1		# wipe key material from registers
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang (at] intel.com>
#	Vinodh Gopal <vinodh.gopal (at] intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	mov	\$-1,%rax		# preset failure return value
	test	$inp,$inp		# NULL user key?
	jz	.Lenc_key_ret
	test	$key,$key		# NULL key schedule?
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	movl	4(%r10),%r10d
	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits		# only 128/192/256 are valid

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call	.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_128
4322 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4323 call .Lkey_expansion_128 4324 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4325 call .Lkey_expansion_128 4326 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4327 call .Lkey_expansion_128 4328 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4329 call .Lkey_expansion_128 4330 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4331 call .Lkey_expansion_128 4332 $movkey %xmm0,(%rax) 4333 mov $bits,80(%rax) # 240(%rdx) 4334 xor %eax,%eax 4335 jmp .Lenc_key_ret 4336 4337 .align 16 4338 .L10rounds_alt: 4339 movdqa .Lkey_rotate(%rip),%xmm5 4340 mov \$8,%r10d 4341 movdqa .Lkey_rcon1(%rip),%xmm4 4342 movdqa %xmm0,%xmm2 4343 movdqu %xmm0,($key) 4344 jmp .Loop_key128 4345 4346 .align 16 4347 .Loop_key128: 4348 pshufb %xmm5,%xmm0 4349 aesenclast %xmm4,%xmm0 4350 pslld \$1,%xmm4 4351 lea 16(%rax),%rax 4352 4353 movdqa %xmm2,%xmm3 4354 pslldq \$4,%xmm2 4355 pxor %xmm2,%xmm3 4356 pslldq \$4,%xmm2 4357 pxor %xmm2,%xmm3 4358 pslldq \$4,%xmm2 4359 pxor %xmm3,%xmm2 4360 4361 pxor %xmm2,%xmm0 4362 movdqu %xmm0,-16(%rax) 4363 movdqa %xmm0,%xmm2 4364 4365 dec %r10d 4366 jnz .Loop_key128 4367 4368 movdqa .Lkey_rcon1b(%rip),%xmm4 4369 4370 pshufb %xmm5,%xmm0 4371 aesenclast %xmm4,%xmm0 4372 pslld \$1,%xmm4 4373 4374 movdqa %xmm2,%xmm3 4375 pslldq \$4,%xmm2 4376 pxor %xmm2,%xmm3 4377 pslldq \$4,%xmm2 4378 pxor %xmm2,%xmm3 4379 pslldq \$4,%xmm2 4380 pxor %xmm3,%xmm2 4381 4382 pxor %xmm2,%xmm0 4383 movdqu %xmm0,(%rax) 4384 4385 movdqa %xmm0,%xmm2 4386 pshufb %xmm5,%xmm0 4387 aesenclast %xmm4,%xmm0 4388 4389 movdqa %xmm2,%xmm3 4390 pslldq \$4,%xmm2 4391 pxor %xmm2,%xmm3 4392 pslldq \$4,%xmm2 4393 pxor %xmm2,%xmm3 4394 pslldq \$4,%xmm2 4395 pxor %xmm3,%xmm2 4396 4397 pxor %xmm2,%xmm0 4398 movdqu %xmm0,16(%rax) 4399 4400 mov $bits,96(%rax) # 240($key) 4401 xor %eax,%eax 4402 jmp .Lenc_key_ret 4403 4404 .align 16 4405 .L12rounds: 4406 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4407 mov \$11,$bits # 12 rounds for 192 4408 cmp \$`1<<28`,%r10d # AVX, 
but no XOP 4409 je .L12rounds_alt 4410 4411 $movkey %xmm0,($key) # round 0 4412 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4413 call .Lkey_expansion_192a_cold 4414 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4415 call .Lkey_expansion_192b 4416 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4417 call .Lkey_expansion_192a 4418 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4419 call .Lkey_expansion_192b 4420 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4421 call .Lkey_expansion_192a 4422 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4423 call .Lkey_expansion_192b 4424 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4425 call .Lkey_expansion_192a 4426 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4427 call .Lkey_expansion_192b 4428 $movkey %xmm0,(%rax) 4429 mov $bits,48(%rax) # 240(%rdx) 4430 xor %rax, %rax 4431 jmp .Lenc_key_ret 4432 4433 .align 16 4434 .L12rounds_alt: 4435 movdqa .Lkey_rotate192(%rip),%xmm5 4436 movdqa .Lkey_rcon1(%rip),%xmm4 4437 mov \$8,%r10d 4438 movdqu %xmm0,($key) 4439 jmp .Loop_key192 4440 4441 .align 16 4442 .Loop_key192: 4443 movq %xmm2,0(%rax) 4444 movdqa %xmm2,%xmm1 4445 pshufb %xmm5,%xmm2 4446 aesenclast %xmm4,%xmm2 4447 pslld \$1, %xmm4 4448 lea 24(%rax),%rax 4449 4450 movdqa %xmm0,%xmm3 4451 pslldq \$4,%xmm0 4452 pxor %xmm0,%xmm3 4453 pslldq \$4,%xmm0 4454 pxor %xmm0,%xmm3 4455 pslldq \$4,%xmm0 4456 pxor %xmm3,%xmm0 4457 4458 pshufd \$0xff,%xmm0,%xmm3 4459 pxor %xmm1,%xmm3 4460 pslldq \$4,%xmm1 4461 pxor %xmm1,%xmm3 4462 4463 pxor %xmm2,%xmm0 4464 pxor %xmm3,%xmm2 4465 movdqu %xmm0,-16(%rax) 4466 4467 dec %r10d 4468 jnz .Loop_key192 4469 4470 mov $bits,32(%rax) # 240($key) 4471 xor %eax,%eax 4472 jmp .Lenc_key_ret 4473 4474 .align 16 4475 .L14rounds: 4476 movups 16($inp),%xmm2 # remaning half of *userKey 4477 mov \$13,$bits # 14 rounds for 256 4478 lea 16(%rax),%rax 4479 cmp \$`1<<28`,%r10d # AVX, but no XOP 4480 je .L14rounds_alt 4481 4482 $movkey %xmm0,($key) # round 0 4483 $movkey %xmm2,16($key) # round 1 4484 
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call	.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call	.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx), i.e. key->rounds
	xor	%rax,%rax	# return success
	jmp	.Lenc_key_ret

# Alternative 256-bit schedule, taken on AVX-without-XOP parts: the
# SubWord step is performed with aesenclast (against a round-constant
# or zero operand) instead of aeskeygenassist.
.align	16
.L14rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5	# pshufb mask
	movdqa	.Lkey_rcon1(%rip),%xmm4		# round constant
	mov	\$7,%r10d			# 7 spins, two round keys each
	movdqu	%xmm0,0($key)			# round 0
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)			# round 1
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb	%xmm5,%xmm2		# rotate key word into place
	aesenclast	%xmm4,%xmm2	# SubBytes + xor round constant

	movdqa	%xmm0,%xmm3		# prefix-xor previous key's words
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0
	pslld	\$1,%xmm4		# advance round constant

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)		# even round key

	dec	%r10d
	jz	.Ldone_key256

	pshufd	\$0xff,%xmm0,%xmm2	# broadcast last word
	pxor	%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2	# SubWord only, no rotate/rcon

	movdqa	%xmm1,%xmm3		# prefix-xor for the odd half
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm3,%xmm1
4556 4557 pxor %xmm1,%xmm2 4558 movdqu %xmm2,16(%rax) 4559 lea 32(%rax),%rax 4560 movdqa %xmm2,%xmm1 4561 4562 jmp .Loop_key256 4563 4564 .Ldone_key256: 4565 mov $bits,16(%rax) # 240($key) 4566 xor %eax,%eax 4567 jmp .Lenc_key_ret 4568 4569 .align 16 4570 .Lbad_keybits: 4571 mov \$-2,%rax 4572 .Lenc_key_ret: 4573 pxor %xmm0,%xmm0 4574 pxor %xmm1,%xmm1 4575 pxor %xmm2,%xmm2 4576 pxor %xmm3,%xmm3 4577 pxor %xmm4,%xmm4 4578 pxor %xmm5,%xmm5 4579 add \$8,%rsp 4580 ret 4581 .LSEH_end_set_encrypt_key: 4582 4584 .align 16 4585 .Lkey_expansion_128: 4586 $movkey %xmm0,(%rax) 4587 lea 16(%rax),%rax 4588 .Lkey_expansion_128_cold: 4589 shufps \$0b00010000,%xmm0,%xmm4 4590 xorps %xmm4, %xmm0 4591 shufps \$0b10001100,%xmm0,%xmm4 4592 xorps %xmm4, %xmm0 4593 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4594 xorps %xmm1,%xmm0 4595 ret 4596 4597 .align 16 4598 .Lkey_expansion_192a: 4599 $movkey %xmm0,(%rax) 4600 lea 16(%rax),%rax 4601 .Lkey_expansion_192a_cold: 4602 movaps %xmm2, %xmm5 4603 .Lkey_expansion_192b_warm: 4604 shufps \$0b00010000,%xmm0,%xmm4 4605 movdqa %xmm2,%xmm3 4606 xorps %xmm4,%xmm0 4607 shufps \$0b10001100,%xmm0,%xmm4 4608 pslldq \$4,%xmm3 4609 xorps %xmm4,%xmm0 4610 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4611 pxor %xmm3,%xmm2 4612 pxor %xmm1,%xmm0 4613 pshufd \$0b11111111,%xmm0,%xmm3 4614 pxor %xmm3,%xmm2 4615 ret 4616 4617 .align 16 4618 .Lkey_expansion_192b: 4619 movaps %xmm0,%xmm3 4620 shufps \$0b01000100,%xmm0,%xmm5 4621 $movkey %xmm5,(%rax) 4622 shufps \$0b01001110,%xmm2,%xmm3 4623 $movkey %xmm3,16(%rax) 4624 lea 32(%rax),%rax 4625 jmp .Lkey_expansion_192b_warm 4626 4627 .align 16 4628 .Lkey_expansion_256a: 4629 $movkey %xmm2,(%rax) 4630 lea 16(%rax),%rax 4631 .Lkey_expansion_256a_cold: 4632 shufps \$0b00010000,%xmm0,%xmm4 4633 xorps %xmm4,%xmm0 4634 shufps \$0b10001100,%xmm0,%xmm4 4635 xorps %xmm4,%xmm0 4636 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4637 xorps %xmm1,%xmm0 4638 ret 4639 4640 .align 16 4641 .Lkey_expansion_256b: 4642 $movkey 
%xmm0,(%rax) 4643 lea 16(%rax),%rax 4644 4645 shufps \$0b00010000,%xmm2,%xmm4 4646 xorps %xmm4,%xmm2 4647 shufps \$0b10001100,%xmm2,%xmm4 4648 xorps %xmm4,%xmm2 4649 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4650 xorps %xmm1,%xmm2 4651 ret 4652 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4653 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4654 ___ 4655 } 4656 4658 $code.=<<___; 4659 .align 64 4660 .Lbswap_mask: 4661 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4662 .Lincrement32: 4663 .long 6,6,6,0 4664 .Lincrement64: 4665 .long 1,0,0,0 4666 .Lxts_magic: 4667 .long 0x87,0,1,0 4668 .Lincrement1: 4669 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4670 .Lkey_rotate: 4671 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4672 .Lkey_rotate192: 4673 .long 0x04070605,0x04070605,0x04070605,0x04070605 4674 .Lkey_rcon1: 4675 .long 1,1,1,1 4676 .Lkey_rcon1b: 4677 .long 0x1b,0x1b,0x1b,0x1b 4678 4679 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4680 .align 64 4681 ___ 4682 4683 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4684 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 4685 if ($win64) { 4686 $rec="%rcx"; 4687 $frame="%rdx"; 4688 $context="%r8"; 4689 $disp="%r9"; 4690 4691 $code.=<<___; 4692 .extern __imp_RtlVirtualUnwind 4693 ___ 4694 $code.=<<___ if ($PREFIX eq "aesni"); 4695 .type ecb_ccm64_se_handler,\@abi-omnipotent 4696 .align 16 4697 ecb_ccm64_se_handler: 4698 push %rsi 4699 push %rdi 4700 push %rbx 4701 push %rbp 4702 push %r12 4703 push %r13 4704 push %r14 4705 push %r15 4706 pushfq 4707 sub \$64,%rsp 4708 4709 mov 120($context),%rax # pull context->Rax 4710 mov 248($context),%rbx # pull context->Rip 4711 4712 mov 8($disp),%rsi # disp->ImageBase 4713 mov 56($disp),%r11 # disp->HandlerData 4714 4715 mov 0(%r11),%r10d # HandlerData[0] 4716 lea (%rsi,%r10),%r10 # prologue label 4717 cmp %r10,%rbx # context->Rip<prologue label 4718 jb .Lcommon_seh_tail 4719 4720 mov 152($context),%rax # pull context->Rsp 
4721 4722 mov 4(%r11),%r10d # HandlerData[1] 4723 lea (%rsi,%r10),%r10 # epilogue label 4724 cmp %r10,%rbx # context->Rip>=epilogue label 4725 jae .Lcommon_seh_tail 4726 4727 lea 0(%rax),%rsi # %xmm save area 4728 lea 512($context),%rdi # &context.Xmm6 4729 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4730 .long 0xa548f3fc # cld; rep movsq 4731 lea 0x58(%rax),%rax # adjust stack pointer 4732 4733 jmp .Lcommon_seh_tail 4734 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4735 4736 .type ctr_xts_se_handler,\@abi-omnipotent 4737 .align 16 4738 ctr_xts_se_handler: 4739 push %rsi 4740 push %rdi 4741 push %rbx 4742 push %rbp 4743 push %r12 4744 push %r13 4745 push %r14 4746 push %r15 4747 pushfq 4748 sub \$64,%rsp 4749 4750 mov 120($context),%rax # pull context->Rax 4751 mov 248($context),%rbx # pull context->Rip 4752 4753 mov 8($disp),%rsi # disp->ImageBase 4754 mov 56($disp),%r11 # disp->HandlerData 4755 4756 mov 0(%r11),%r10d # HandlerData[0] 4757 lea (%rsi,%r10),%r10 # prologue lable 4758 cmp %r10,%rbx # context->Rip<prologue label 4759 jb .Lcommon_seh_tail 4760 4761 mov 152($context),%rax # pull context->Rsp 4762 4763 mov 4(%r11),%r10d # HandlerData[1] 4764 lea (%rsi,%r10),%r10 # epilogue label 4765 cmp %r10,%rbx # context->Rip>=epilogue label 4766 jae .Lcommon_seh_tail 4767 4768 mov 208($context),%rax # pull context->R11 4769 4770 lea -0xa8(%rax),%rsi # %xmm save area 4771 lea 512($context),%rdi # & context.Xmm6 4772 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4773 .long 0xa548f3fc # cld; rep movsq 4774 4775 mov -8(%rax),%rbp # restore saved %rbp 4776 mov %rbp,160($context) # restore context->Rbp 4777 jmp .Lcommon_seh_tail 4778 .size ctr_xts_se_handler,.-ctr_xts_se_handler 4779 4780 .type ocb_se_handler,\@abi-omnipotent 4781 .align 16 4782 ocb_se_handler: 4783 push %rsi 4784 push %rdi 4785 push %rbx 4786 push %rbp 4787 push %r12 4788 push %r13 4789 push %r14 4790 push %r15 4791 pushfq 4792 sub \$64,%rsp 4793 4794 mov 120($context),%rax # pull context->Rax 4795 
mov 248($context),%rbx # pull context->Rip 4796 4797 mov 8($disp),%rsi # disp->ImageBase 4798 mov 56($disp),%r11 # disp->HandlerData 4799 4800 mov 0(%r11),%r10d # HandlerData[0] 4801 lea (%rsi,%r10),%r10 # prologue lable 4802 cmp %r10,%rbx # context->Rip<prologue label 4803 jb .Lcommon_seh_tail 4804 4805 mov 4(%r11),%r10d # HandlerData[1] 4806 lea (%rsi,%r10),%r10 # epilogue label 4807 cmp %r10,%rbx # context->Rip>=epilogue label 4808 jae .Lcommon_seh_tail 4809 4810 mov 8(%r11),%r10d # HandlerData[2] 4811 lea (%rsi,%r10),%r10 4812 cmp %r10,%rbx # context->Rip>=pop label 4813 jae .Locb_no_xmm 4814 4815 mov 152($context),%rax # pull context->Rsp 4816 4817 lea (%rax),%rsi # %xmm save area 4818 lea 512($context),%rdi # & context.Xmm6 4819 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4820 .long 0xa548f3fc # cld; rep movsq 4821 lea 0xa0+0x28(%rax),%rax 4822 4823 .Locb_no_xmm: 4824 mov -8(%rax),%rbx 4825 mov -16(%rax),%rbp 4826 mov -24(%rax),%r12 4827 mov -32(%rax),%r13 4828 mov -40(%rax),%r14 4829 4830 mov %rbx,144($context) # restore context->Rbx 4831 mov %rbp,160($context) # restore context->Rbp 4832 mov %r12,216($context) # restore context->R12 4833 mov %r13,224($context) # restore context->R13 4834 mov %r14,232($context) # restore context->R14 4835 4836 jmp .Lcommon_seh_tail 4837 .size ocb_se_handler,.-ocb_se_handler 4838 ___ 4839 $code.=<<___; 4840 .type cbc_se_handler,\@abi-omnipotent 4841 .align 16 4842 cbc_se_handler: 4843 push %rsi 4844 push %rdi 4845 push %rbx 4846 push %rbp 4847 push %r12 4848 push %r13 4849 push %r14 4850 push %r15 4851 pushfq 4852 sub \$64,%rsp 4853 4854 mov 152($context),%rax # pull context->Rsp 4855 mov 248($context),%rbx # pull context->Rip 4856 4857 lea .Lcbc_decrypt_bulk(%rip),%r10 4858 cmp %r10,%rbx # context->Rip<"prologue" label 4859 jb .Lcommon_seh_tail 4860 4861 mov 120($context),%rax # pull context->Rax 4862 4863 lea .Lcbc_decrypt_body(%rip),%r10 4864 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4865 jb .Lcommon_seh_tail 4866 
4867 mov 152($context),%rax # pull context->Rsp 4868 4869 lea .Lcbc_ret(%rip),%r10 4870 cmp %r10,%rbx # context->Rip>="epilogue" label 4871 jae .Lcommon_seh_tail 4872 4873 lea 16(%rax),%rsi # %xmm save area 4874 lea 512($context),%rdi # &context.Xmm6 4875 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4876 .long 0xa548f3fc # cld; rep movsq 4877 4878 mov 208($context),%rax # pull context->R11 4879 4880 mov -8(%rax),%rbp # restore saved %rbp 4881 mov %rbp,160($context) # restore context->Rbp 4882 4883 .Lcommon_seh_tail: 4884 mov 8(%rax),%rdi 4885 mov 16(%rax),%rsi 4886 mov %rax,152($context) # restore context->Rsp 4887 mov %rsi,168($context) # restore context->Rsi 4888 mov %rdi,176($context) # restore context->Rdi 4889 4890 mov 40($disp),%rdi # disp->ContextRecord 4891 mov $context,%rsi # context 4892 mov \$154,%ecx # sizeof(CONTEXT) 4893 .long 0xa548f3fc # cld; rep movsq 4894 4895 mov $disp,%rsi 4896 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4897 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4898 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4899 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4900 mov 40(%rsi),%r10 # disp->ContextRecord 4901 lea 56(%rsi),%r11 # &disp->HandlerData 4902 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4903 mov %r10,32(%rsp) # arg5 4904 mov %r11,40(%rsp) # arg6 4905 mov %r12,48(%rsp) # arg7 4906 mov %rcx,56(%rsp) # arg8, (NULL) 4907 call *__imp_RtlVirtualUnwind(%rip) 4908 4909 mov \$1,%eax # ExceptionContinueSearch 4910 add \$64,%rsp 4911 popfq 4912 pop %r15 4913 pop %r14 4914 pop %r13 4915 pop %r12 4916 pop %rbp 4917 pop %rbx 4918 pop %rdi 4919 pop %rsi 4920 ret 4921 .size cbc_se_handler,.-cbc_se_handler 4922 4923 .section .pdata 4924 .align 4 4925 ___ 4926 $code.=<<___ if ($PREFIX eq "aesni"); 4927 .rva .LSEH_begin_aesni_ecb_encrypt 4928 .rva .LSEH_end_aesni_ecb_encrypt 4929 .rva .LSEH_info_ecb 4930 4931 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 4932 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 4933 .rva .LSEH_info_ccm64_enc 4934 4935 .rva 
.LSEH_begin_aesni_ccm64_decrypt_blocks 4936 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 4937 .rva .LSEH_info_ccm64_dec 4938 4939 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 4940 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 4941 .rva .LSEH_info_ctr32 4942 4943 .rva .LSEH_begin_aesni_xts_encrypt 4944 .rva .LSEH_end_aesni_xts_encrypt 4945 .rva .LSEH_info_xts_enc 4946 4947 .rva .LSEH_begin_aesni_xts_decrypt 4948 .rva .LSEH_end_aesni_xts_decrypt 4949 .rva .LSEH_info_xts_dec 4950 4951 .rva .LSEH_begin_aesni_ocb_encrypt 4952 .rva .LSEH_end_aesni_ocb_encrypt 4953 .rva .LSEH_info_ocb_enc 4954 4955 .rva .LSEH_begin_aesni_ocb_decrypt 4956 .rva .LSEH_end_aesni_ocb_decrypt 4957 .rva .LSEH_info_ocb_dec 4958 ___ 4959 $code.=<<___; 4960 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 4961 .rva .LSEH_end_${PREFIX}_cbc_encrypt 4962 .rva .LSEH_info_cbc 4963 4964 .rva ${PREFIX}_set_decrypt_key 4965 .rva .LSEH_end_set_decrypt_key 4966 .rva .LSEH_info_key 4967 4968 .rva ${PREFIX}_set_encrypt_key 4969 .rva .LSEH_end_set_encrypt_key 4970 .rva .LSEH_info_key 4971 .section .xdata 4972 .align 8 4973 ___ 4974 $code.=<<___ if ($PREFIX eq "aesni"); 4975 .LSEH_info_ecb: 4976 .byte 9,0,0,0 4977 .rva ecb_ccm64_se_handler 4978 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 4979 .LSEH_info_ccm64_enc: 4980 .byte 9,0,0,0 4981 .rva ecb_ccm64_se_handler 4982 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 4983 .LSEH_info_ccm64_dec: 4984 .byte 9,0,0,0 4985 .rva ecb_ccm64_se_handler 4986 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 4987 .LSEH_info_ctr32: 4988 .byte 9,0,0,0 4989 .rva ctr_xts_se_handler 4990 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 4991 .LSEH_info_xts_enc: 4992 .byte 9,0,0,0 4993 .rva ctr_xts_se_handler 4994 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 4995 .LSEH_info_xts_dec: 4996 .byte 9,0,0,0 4997 .rva ctr_xts_se_handler 4998 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 4999 .LSEH_info_ocb_enc: 5000 .byte 9,0,0,0 5001 .rva ocb_se_handler 5002 .rva 
.Locb_enc_body,.Locb_enc_epilogue # HandlerData[] 5003 .rva .Locb_enc_pop 5004 .long 0 5005 .LSEH_info_ocb_dec: 5006 .byte 9,0,0,0 5007 .rva ocb_se_handler 5008 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] 5009 .rva .Locb_dec_pop 5010 .long 0 5011 ___ 5012 $code.=<<___; 5013 .LSEH_info_cbc: 5014 .byte 9,0,0,0 5015 .rva cbc_se_handler 5016 .LSEH_info_key: 5017 .byte 0x01,0x04,0x01,0x00 5018 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 5019 ___ 5020 } 5021 5022 sub rex { 5023 local *opcode=shift; 5024 my ($dst,$src)=@_; 5025 my $rex=0; 5026 5027 $rex|=0x04 if($dst>=8); 5028 $rex|=0x01 if($src>=8); 5029 push @opcode,$rex|0x40 if($rex); 5030 } 5031 5032 sub aesni { 5033 my $line=shift; 5034 my @opcode=(0x66); 5035 5036 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5037 rex(\@opcode,$4,$3); 5038 push @opcode,0x0f,0x3a,0xdf; 5039 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 5040 my $c=$2; 5041 push @opcode,$c=~/^0/?oct($c):$c; 5042 return ".byte\t".join(',',@opcode); 5043 } 5044 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5045 my %opcodelet = ( 5046 "aesimc" => 0xdb, 5047 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5048 "aesdec" => 0xde, "aesdeclast" => 0xdf 5049 ); 5050 return undef if (!defined($opcodelet{$1})); 5051 rex(\@opcode,$3,$2); 5052 push @opcode,0x0f,0x38,$opcodelet{$1}; 5053 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 5054 return ".byte\t".join(',',@opcode); 5055 } 5056 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 5057 my %opcodelet = ( 5058 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5059 "aesdec" => 0xde, "aesdeclast" => 0xdf 5060 ); 5061 return undef if (!defined($opcodelet{$1})); 5062 my $off = $2; 5063 push @opcode,0x44 if ($3>=8); 5064 push @opcode,0x0f,0x38,$opcodelet{$1}; 5065 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 5066 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 5067 return ".byte\t".join(',',@opcode); 5068 } 5069 return $line; 5070 } 5071 5072 sub movbe { 
5073 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 5074 } 5075 5076 $code =~ s/\`([^\`]*)\`/eval($1)/gem; 5077 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 5078 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 5079 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 5080 5081 print $code; 5082 5083 close STDOUT; 5084