1 #!/usr/bin/env perl 2 3 ################################################################### 4 ### AES-128 [originally in CTR mode] ### 5 ### bitsliced implementation for Intel Core 2 processors ### 6 ### requires support of SSE extensions up to SSSE3 ### 7 ### Author: Emilia Ksper and Peter Schwabe ### 8 ### Date: 2009-03-19 ### 9 ### Public domain ### 10 ### ### 11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12 ### further information. ### 13 ################################################################### 14 # 15 # September 2011. 16 # 17 # Started as transliteration to "perlasm" the original code has 18 # undergone following changes: 19 # 20 # - code was made position-independent; 21 # - rounds were folded into a loop resulting in >5x size reduction 22 # from 12.5KB to 2.2KB; 23 # - above was possibile thanks to mixcolumns() modification that 24 # allowed to feed its output back to aesenc[last], this was 25 # achieved at cost of two additional inter-registers moves; 26 # - some instruction reordering and interleaving; 27 # - this module doesn't implement key setup subroutine, instead it 28 # relies on conversion of "conventional" key schedule as returned 29 # by AES_set_encrypt_key (see discussion below); 30 # - first and last round keys are treated differently, which allowed 31 # to skip one shiftrows(), reduce bit-sliced key schedule and 32 # speed-up conversion by 22%; 33 # - support for 192- and 256-bit keys was added; 34 # 35 # Resulting performance in CPU cycles spent to encrypt one byte out 36 # of 4096-byte buffer with 128-bit key is: 37 # 38 # Emilia's this(*) difference 39 # 40 # Core 2 9.30 8.69 +7% 41 # Nehalem(**) 7.63 6.88 +11% 42 # Atom 17.1 16.4 +4% 43 # 44 # (*) Comparison is not completely fair, because "this" is ECB, 45 # i.e. no extra processing such as counter values calculation 46 # and xor-ing input as in Emilia's CTR implementation is 47 # performed. However, the CTR calculations stand for not more 48 # than 1% of total time, so comparison is *rather* fair. 49 # 50 # (**) Results were collected on Westmere, which is considered to 51 # be equivalent to Nehalem for this code. 52 # 53 # As for key schedule conversion subroutine. Interface to OpenSSL 54 # relies on per-invocation on-the-fly conversion. This naturally 55 # has impact on performance, especially for short inputs. Conversion 56 # time in CPU cycles and its ratio to CPU cycles spent in 8x block 57 # function is: 58 # 59 # conversion conversion/8x block 60 # Core 2 240 0.22 61 # Nehalem 180 0.20 62 # Atom 430 0.20 63 # 64 # The ratio values mean that 128-byte blocks will be processed 65 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 66 # etc. Then keep in mind that input sizes not divisible by 128 are 67 # *effectively* slower, especially shortest ones, e.g. consecutive 68 # 144-byte blocks are processed 44% slower than one would expect, 69 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 70 # it's still faster than ["hyper-threading-safe" code path in] 71 # aes-x86_64.pl on all lengths above 64 bytes... 72 # 73 # October 2011. 74 # 75 # Add decryption procedure. Performance in CPU cycles spent to decrypt 76 # one byte out of 4096-byte buffer with 128-bit key is: 77 # 78 # Core 2 9.98 79 # Nehalem 7.80 80 # Atom 17.9 81 # 82 # November 2011. 83 # 84 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 85 # suboptimal, but XTS is meant to be used with larger blocks... 86 # 87 # <appro (at] openssl.org> 88 89 $flavour = shift; 90 $output = shift; 91 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 92 93 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 94 95 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 96 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 97 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 98 die "can't locate x86_64-xlate.pl"; 99 100 open OUT,"| \"$^X\" $xlate $flavour $output"; 101 *STDOUT=*OUT; 102 103 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 104 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 105 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 106 107 { 108 my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 109 110 sub Sbox { 111 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 112 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 113 my @b=@_[0..7]; 114 my @t=@_[8..11]; 115 my @s=@_[12..15]; 116 &InBasisChange (@b); 117 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 118 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 119 } 120 121 sub InBasisChange { 122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 123 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 124 my @b=@_[0..7]; 125 $code.=<<___; 126 pxor @b[6], @b[5] 127 pxor @b[1], @b[2] 128 pxor @b[0], @b[3] 129 pxor @b[2], @b[6] 130 pxor @b[0], @b[5] 131 132 pxor @b[3], @b[6] 133 pxor @b[7], @b[3] 134 pxor @b[5], @b[7] 135 pxor @b[4], @b[3] 136 pxor @b[5], @b[4] 137 pxor @b[1], @b[3] 138 139 pxor @b[7], @b[2] 140 pxor @b[5], @b[1] 141 ___ 142 } 143 144 sub OutBasisChange { 145 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 146 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 147 my @b=@_[0..7]; 148 $code.=<<___; 149 pxor @b[6], @b[0] 150 pxor @b[4], @b[1] 151 pxor @b[0], @b[2] 152 pxor @b[6], @b[4] 153 pxor @b[1], @b[6] 154 155 pxor @b[5], @b[1] 156 pxor @b[3], @b[5] 157 pxor @b[7], @b[3] 158 pxor @b[5], @b[7] 159 pxor @b[5], @b[2] 160 161 pxor @b[7], @b[4] 162 ___ 163 } 164 165 sub InvSbox { 166 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 167 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 168 my @b=@_[0..7]; 169 my @t=@_[8..11]; 170 my @s=@_[12..15]; 171 &InvInBasisChange (@b); 172 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 173 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 174 } 175 176 sub InvInBasisChange { # OutBasisChange in reverse 177 my @b=@_[5,1,2,6,3,7,0,4]; 178 $code.=<<___ 179 pxor @b[7], @b[4] 180 181 pxor @b[5], @b[7] 182 pxor @b[5], @b[2] 183 pxor @b[7], @b[3] 184 pxor @b[3], @b[5] 185 pxor @b[5], @b[1] 186 187 pxor @b[1], @b[6] 188 pxor @b[0], @b[2] 189 pxor @b[6], @b[4] 190 pxor @b[6], @b[0] 191 pxor @b[4], @b[1] 192 ___ 193 } 194 195 sub InvOutBasisChange { # InBasisChange in reverse 196 my @b=@_[2,5,7,3,6,1,0,4]; 197 $code.=<<___; 198 pxor @b[5], @b[1] 199 pxor @b[7], @b[2] 200 201 pxor @b[1], @b[3] 202 pxor @b[5], @b[4] 203 pxor @b[5], @b[7] 204 pxor @b[4], @b[3] 205 pxor @b[0], @b[5] 206 pxor @b[7], @b[3] 207 pxor @b[2], @b[6] 208 pxor @b[1], @b[2] 209 pxor @b[3], @b[6] 210 211 pxor @b[0], @b[3] 212 pxor @b[6], @b[5] 213 ___ 214 } 215 216 sub Mul_GF4 { 217 #;************************************************************* 218 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 219 #;************************************************************* 220 my ($x0,$x1,$y0,$y1,$t0)=@_; 221 $code.=<<___; 222 movdqa $y0, $t0 223 pxor $y1, $t0 224 pand $x0, $t0 225 pxor $x1, $x0 226 pand $y0, $x1 227 pand $y1, $x0 228 pxor $x1, $x0 229 pxor $t0, $x1 230 ___ 231 } 232 233 sub Mul_GF4_N { # not used, see next subroutine 234 # multiply and scale by N 235 my ($x0,$x1,$y0,$y1,$t0)=@_; 236 $code.=<<___; 237 movdqa $y0, $t0 238 pxor $y1, $t0 239 pand $x0, $t0 240 pxor $x1, $x0 241 pand $y0, $x1 242 pand $y1, $x0 243 pxor $x0, $x1 244 pxor $t0, $x0 245 ___ 246 } 247 248 sub Mul_GF4_N_GF4 { 249 # interleaved Mul_GF4_N and Mul_GF4 250 my ($x0,$x1,$y0,$y1,$t0, 251 $x2,$x3,$y2,$y3,$t1)=@_; 252 $code.=<<___; 253 movdqa $y0, $t0 254 movdqa $y2, $t1 255 pxor $y1, $t0 256 pxor $y3, $t1 257 pand $x0, $t0 258 pand $x2, $t1 259 pxor $x1, $x0 260 pxor $x3, $x2 261 pand $y0, $x1 262 pand $y2, $x3 263 pand $y1, $x0 264 pand $y3, $x2 265 pxor $x0, $x1 266 pxor $x3, $x2 267 pxor $t0, $x0 268 pxor $t1, $x3 269 ___ 270 } 271 sub Mul_GF16_2 { 272 my @x=@_[0..7]; 273 my @y=@_[8..11]; 274 my @t=@_[12..15]; 275 $code.=<<___; 276 movdqa @x[0], @t[0] 277 movdqa @x[1], @t[1] 278 ___ 279 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 280 $code.=<<___; 281 pxor @x[2], @t[0] 282 pxor @x[3], @t[1] 283 pxor @y[2], @y[0] 284 pxor @y[3], @y[1] 285 ___ 286 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 287 @x[2], @x[3], @y[2], @y[3], @t[2]); 288 $code.=<<___; 289 pxor @t[0], @x[0] 290 pxor @t[0], @x[2] 291 pxor @t[1], @x[1] 292 pxor @t[1], @x[3] 293 294 movdqa @x[4], @t[0] 295 movdqa @x[5], @t[1] 296 pxor @x[6], @t[0] 297 pxor @x[7], @t[1] 298 ___ 299 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 300 @x[6], @x[7], @y[2], @y[3], @t[2]); 301 $code.=<<___; 302 pxor @y[2], @y[0] 303 pxor @y[3], @y[1] 304 ___ 305 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 306 $code.=<<___; 307 pxor @t[0], @x[4] 308 pxor @t[0], @x[6] 309 pxor @t[1], @x[5] 310 pxor @t[1], @x[7] 311 ___ 312 } 313 sub Inv_GF256 { 314 #;******************************************************************** 315 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 316 #;******************************************************************** 317 my @x=@_[0..7]; 318 my @t=@_[8..11]; 319 my @s=@_[12..15]; 320 # direct optimizations from hardware 321 $code.=<<___; 322 movdqa @x[4], @t[3] 323 movdqa @x[5], @t[2] 324 movdqa @x[1], @t[1] 325 movdqa @x[7], @s[1] 326 movdqa @x[0], @s[0] 327 328 pxor @x[6], @t[3] 329 pxor @x[7], @t[2] 330 pxor @x[3], @t[1] 331 movdqa @t[3], @s[2] 332 pxor @x[6], @s[1] 333 movdqa @t[2], @t[0] 334 pxor @x[2], @s[0] 335 movdqa @t[3], @s[3] 336 337 por @t[1], @t[2] 338 por @s[0], @t[3] 339 pxor @t[0], @s[3] 340 pand @s[0], @s[2] 341 pxor @t[1], @s[0] 342 pand @t[1], @t[0] 343 pand @s[0], @s[3] 344 movdqa @x[3], @s[0] 345 pxor @x[2], @s[0] 346 pand @s[0], @s[1] 347 pxor @s[1], @t[3] 348 pxor @s[1], @t[2] 349 movdqa @x[4], @s[1] 350 movdqa @x[1], @s[0] 351 pxor @x[5], @s[1] 352 pxor @x[0], @s[0] 353 movdqa @s[1], @t[1] 354 pand @s[0], @s[1] 355 por @s[0], @t[1] 356 pxor @s[1], @t[0] 357 pxor @s[3], @t[3] 358 pxor @s[2], @t[2] 359 pxor @s[3], @t[1] 360 movdqa @x[7], @s[0] 361 pxor @s[2], @t[0] 362 movdqa @x[6], @s[1] 363 pxor @s[2], @t[1] 364 movdqa @x[5], @s[2] 365 pand @x[3], @s[0] 366 movdqa @x[4], @s[3] 367 pand @x[2], @s[1] 368 pand @x[1], @s[2] 369 por @x[0], @s[3] 370 pxor @s[0], @t[3] 371 pxor @s[1], @t[2] 372 pxor @s[2], @t[1] 373 pxor @s[3], @t[0] 374 375 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 376 377 # new smaller inversion 378 379 movdqa @t[3], @s[0] 380 pand @t[1], @t[3] 381 pxor @t[2], @s[0] 382 383 movdqa @t[0], @s[2] 384 movdqa @s[0], @s[3] 385 pxor @t[3], @s[2] 386 pand @s[2], @s[3] 387 388 movdqa @t[1], @s[1] 389 pxor @t[2], @s[3] 390 pxor @t[0], @s[1] 391 392 pxor @t[2], @t[3] 393 394 pand @t[3], @s[1] 395 396 movdqa @s[2], @t[2] 397 pxor @t[0], @s[1] 398 399 pxor @s[1], @t[2] 400 pxor @s[1], @t[1] 401 402 pand @t[0], @t[2] 403 404 pxor @t[2], @s[2] 405 pxor @t[2], @t[1] 406 407 pand @s[3], @s[2] 408 409 pxor @s[0], @s[2] 410 ___ 411 # output in s3, s2, s1, t1 412 413 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 414 415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 416 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 417 418 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 419 } 420 421 # AES linear components 422 423 sub ShiftRows { 424 my @x=@_[0..7]; 425 my $mask=pop; 426 $code.=<<___; 427 pxor 0x00($key),@x[0] 428 pxor 0x10($key),@x[1] 429 pxor 0x20($key),@x[2] 430 pxor 0x30($key),@x[3] 431 pshufb $mask,@x[0] 432 pshufb $mask,@x[1] 433 pxor 0x40($key),@x[4] 434 pxor 0x50($key),@x[5] 435 pshufb $mask,@x[2] 436 pshufb $mask,@x[3] 437 pxor 0x60($key),@x[6] 438 pxor 0x70($key),@x[7] 439 pshufb $mask,@x[4] 440 pshufb $mask,@x[5] 441 pshufb $mask,@x[6] 442 pshufb $mask,@x[7] 443 lea 0x80($key),$key 444 ___ 445 } 446 447 sub MixColumns { 448 # modified to emit output in order suitable for feeding back to aesenc[last] 449 my @x=@_[0..7]; 450 my @t=@_[8..15]; 451 my $inv=@_[16]; # optional 452 $code.=<<___; 453 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 454 pshufd \$0x93, @x[1], @t[1] 455 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 456 pshufd \$0x93, @x[2], @t[2] 457 pxor @t[1], @x[1] 458 pshufd \$0x93, @x[3], @t[3] 459 pxor @t[2], @x[2] 460 pshufd \$0x93, @x[4], @t[4] 461 pxor @t[3], @x[3] 462 pshufd \$0x93, @x[5], @t[5] 463 pxor @t[4], @x[4] 464 pshufd \$0x93, @x[6], @t[6] 465 pxor @t[5], @x[5] 466 pshufd \$0x93, @x[7], @t[7] 467 pxor @t[6], @x[6] 468 pxor @t[7], @x[7] 469 470 pxor @x[0], @t[1] 471 pxor @x[7], @t[0] 472 pxor @x[7], @t[1] 473 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 474 pxor @x[1], @t[2] 475 pshufd \$0x4E, @x[1], @x[1] 476 pxor @x[4], @t[5] 477 pxor @t[0], @x[0] 478 pxor @x[5], @t[6] 479 pxor @t[1], @x[1] 480 pxor @x[3], @t[4] 481 pshufd \$0x4E, @x[4], @t[0] 482 pxor @x[6], @t[7] 483 pshufd \$0x4E, @x[5], @t[1] 484 pxor @x[2], @t[3] 485 pshufd \$0x4E, @x[3], @x[4] 486 pxor @x[7], @t[3] 487 pshufd \$0x4E, @x[7], @x[5] 488 pxor @x[7], @t[4] 489 pshufd \$0x4E, @x[6], @x[3] 490 pxor @t[4], @t[0] 491 pshufd \$0x4E, @x[2], @x[6] 492 pxor @t[5], @t[1] 493 ___ 494 $code.=<<___ if (!$inv); 495 pxor @t[3], @x[4] 496 pxor @t[7], @x[5] 497 pxor @t[6], @x[3] 498 movdqa @t[0], @x[2] 499 pxor @t[2], @x[6] 500 movdqa @t[1], @x[7] 501 ___ 502 $code.=<<___ if ($inv); 503 pxor @x[4], @t[3] 504 pxor @t[7], @x[5] 505 pxor @x[3], @t[6] 506 movdqa @t[0], @x[3] 507 pxor @t[2], @x[6] 508 movdqa @t[6], @x[2] 509 movdqa @t[1], @x[7] 510 movdqa @x[6], @x[4] 511 movdqa @t[3], @x[6] 512 ___ 513 } 514 515 sub InvMixColumns_orig { 516 my @x=@_[0..7]; 517 my @t=@_[8..15]; 518 519 $code.=<<___; 520 # multiplication by 0x0e 521 pshufd \$0x93, @x[7], @t[7] 522 movdqa @x[2], @t[2] 523 pxor @x[5], @x[7] # 7 5 524 pxor @x[5], @x[2] # 2 5 525 pshufd \$0x93, @x[0], @t[0] 526 movdqa @x[5], @t[5] 527 pxor @x[0], @x[5] # 5 0 [1] 528 pxor @x[1], @x[0] # 0 1 529 pshufd \$0x93, @x[1], @t[1] 530 pxor @x[2], @x[1] # 1 25 531 pxor @x[6], @x[0] # 01 6 [2] 532 pxor @x[3], @x[1] # 125 3 [4] 533 pshufd \$0x93, @x[3], @t[3] 534 pxor @x[0], @x[2] # 25 016 [3] 535 pxor @x[7], @x[3] # 3 75 536 pxor @x[6], @x[7] # 75 6 [0] 537 pshufd \$0x93, @x[6], @t[6] 538 movdqa @x[4], @t[4] 539 pxor @x[4], @x[6] # 6 4 540 pxor @x[3], @x[4] # 4 375 [6] 541 pxor @x[7], @x[3] # 375 756=36 542 pxor @t[5], @x[6] # 64 5 [7] 543 pxor @t[2], @x[3] # 36 2 544 pxor @t[4], @x[3] # 362 4 [5] 545 pshufd \$0x93, @t[5], @t[5] 546 ___ 547 my @y = @x[7,5,0,2,1,3,4,6]; 548 $code.=<<___; 549 # multiplication by 0x0b 550 pxor @y[0], @y[1] 551 pxor @t[0], @y[0] 552 pxor @t[1], @y[1] 553 pshufd \$0x93, @t[2], @t[2] 554 pxor @t[5], @y[0] 555 pxor @t[6], @y[1] 556 pxor @t[7], @y[0] 557 pshufd \$0x93, @t[4], @t[4] 558 pxor @t[6], @t[7] # clobber t[7] 559 pxor @y[0], @y[1] 560 561 pxor @t[0], @y[3] 562 pshufd \$0x93, @t[0], @t[0] 563 pxor @t[1], @y[2] 564 pxor @t[1], @y[4] 565 pxor @t[2], @y[2] 566 pshufd \$0x93, @t[1], @t[1] 567 pxor @t[2], @y[3] 568 pxor @t[2], @y[5] 569 pxor @t[7], @y[2] 570 pshufd \$0x93, @t[2], @t[2] 571 pxor @t[3], @y[3] 572 pxor @t[3], @y[6] 573 pxor @t[3], @y[4] 574 pshufd \$0x93, @t[3], @t[3] 575 pxor @t[4], @y[7] 576 pxor @t[4], @y[5] 577 pxor @t[7], @y[7] 578 pxor @t[5], @y[3] 579 pxor @t[4], @y[4] 580 pxor @t[5], @t[7] # clobber t[7] even more 581 582 pxor @t[7], @y[5] 583 pshufd \$0x93, @t[4], @t[4] 584 pxor @t[7], @y[6] 585 pxor @t[7], @y[4] 586 587 pxor @t[5], @t[7] 588 pshufd \$0x93, @t[5], @t[5] 589 pxor @t[6], @t[7] # restore t[7] 590 591 # multiplication by 0x0d 592 pxor @y[7], @y[4] 593 pxor @t[4], @y[7] 594 pshufd \$0x93, @t[6], @t[6] 595 pxor @t[0], @y[2] 596 pxor @t[5], @y[7] 597 pxor @t[2], @y[2] 598 pshufd \$0x93, @t[7], @t[7] 599 600 pxor @y[1], @y[3] 601 pxor @t[1], @y[1] 602 pxor @t[0], @y[0] 603 pxor @t[0], @y[3] 604 pxor @t[5], @y[1] 605 pxor @t[5], @y[0] 606 pxor @t[7], @y[1] 607 pshufd \$0x93, @t[0], @t[0] 608 pxor @t[6], @y[0] 609 pxor @y[1], @y[3] 610 pxor @t[1], @y[4] 611 pshufd \$0x93, @t[1], @t[1] 612 613 pxor @t[7], @y[7] 614 pxor @t[2], @y[4] 615 pxor @t[2], @y[5] 616 pshufd \$0x93, @t[2], @t[2] 617 pxor @t[6], @y[2] 618 pxor @t[3], @t[6] # clobber t[6] 619 pxor @y[7], @y[4] 620 pxor @t[6], @y[3] 621 622 pxor @t[6], @y[6] 623 pxor @t[5], @y[5] 624 pxor @t[4], @y[6] 625 pshufd \$0x93, @t[4], @t[4] 626 pxor @t[6], @y[5] 627 pxor @t[7], @y[6] 628 pxor @t[3], @t[6] # restore t[6] 629 630 pshufd \$0x93, @t[5], @t[5] 631 pshufd \$0x93, @t[6], @t[6] 632 pshufd \$0x93, @t[7], @t[7] 633 pshufd \$0x93, @t[3], @t[3] 634 635 # multiplication by 0x09 636 pxor @y[1], @y[4] 637 pxor @y[1], @t[1] # t[1]=y[1] 638 pxor @t[5], @t[0] # clobber t[0] 639 pxor @t[5], @t[1] 640 pxor @t[0], @y[3] 641 pxor @y[0], @t[0] # t[0]=y[0] 642 pxor @t[6], @t[1] 643 pxor @t[7], @t[6] # clobber t[6] 644 pxor @t[1], @y[4] 645 pxor @t[4], @y[7] 646 pxor @y[4], @t[4] # t[4]=y[4] 647 pxor @t[3], @y[6] 648 pxor @y[3], @t[3] # t[3]=y[3] 649 pxor @t[2], @y[5] 650 pxor @y[2], @t[2] # t[2]=y[2] 651 pxor @t[7], @t[3] 652 pxor @y[5], @t[5] # t[5]=y[5] 653 pxor @t[6], @t[2] 654 pxor @t[6], @t[5] 655 pxor @y[6], @t[6] # t[6]=y[6] 656 pxor @y[7], @t[7] # t[7]=y[7] 657 658 movdqa @t[0],@XMM[0] 659 movdqa @t[1],@XMM[1] 660 movdqa @t[2],@XMM[2] 661 movdqa @t[3],@XMM[3] 662 movdqa @t[4],@XMM[4] 663 movdqa @t[5],@XMM[5] 664 movdqa @t[6],@XMM[6] 665 movdqa @t[7],@XMM[7] 666 ___ 667 } 668 669 sub InvMixColumns { 670 my @x=@_[0..7]; 671 my @t=@_[8..15]; 672 673 # Thanks to Jussi Kivilinna for providing pointer to 674 # 675 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 676 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 677 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 678 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 679 680 $code.=<<___; 681 # multiplication by 0x05-0x00-0x04-0x00 682 pshufd \$0x4E, @x[0], @t[0] 683 pshufd \$0x4E, @x[6], @t[6] 684 pxor @x[0], @t[0] 685 pshufd \$0x4E, @x[7], @t[7] 686 pxor @x[6], @t[6] 687 pshufd \$0x4E, @x[1], @t[1] 688 pxor @x[7], @t[7] 689 pshufd \$0x4E, @x[2], @t[2] 690 pxor @x[1], @t[1] 691 pshufd \$0x4E, @x[3], @t[3] 692 pxor @x[2], @t[2] 693 pxor @t[6], @x[0] 694 pxor @t[6], @x[1] 695 pshufd \$0x4E, @x[4], @t[4] 696 pxor @x[3], @t[3] 697 pxor @t[0], @x[2] 698 pxor @t[1], @x[3] 699 pshufd \$0x4E, @x[5], @t[5] 700 pxor @x[4], @t[4] 701 pxor @t[7], @x[1] 702 pxor @t[2], @x[4] 703 pxor @x[5], @t[5] 704 705 pxor @t[7], @x[2] 706 pxor @t[6], @x[3] 707 pxor @t[6], @x[4] 708 pxor @t[3], @x[5] 709 pxor @t[4], @x[6] 710 pxor @t[7], @x[4] 711 pxor @t[7], @x[5] 712 pxor @t[5], @x[7] 713 ___ 714 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 715 } 716 717 sub aesenc { # not used 718 my @b=@_[0..7]; 719 my @t=@_[8..15]; 720 $code.=<<___; 721 movdqa 0x30($const),@t[0] # .LSR 722 ___ 723 &ShiftRows (@b,@t[0]); 724 &Sbox (@b,@t); 725 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 726 } 727 728 sub aesenclast { # not used 729 my @b=@_[0..7]; 730 my @t=@_[8..15]; 731 $code.=<<___; 732 movdqa 0x40($const),@t[0] # .LSRM0 733 ___ 734 &ShiftRows (@b,@t[0]); 735 &Sbox (@b,@t); 736 $code.=<<___ 737 pxor 0x00($key),@b[0] 738 pxor 0x10($key),@b[1] 739 pxor 0x20($key),@b[4] 740 pxor 0x30($key),@b[6] 741 pxor 0x40($key),@b[3] 742 pxor 0x50($key),@b[7] 743 pxor 0x60($key),@b[2] 744 pxor 0x70($key),@b[5] 745 ___ 746 } 747 748 sub swapmove { 749 my ($a,$b,$n,$mask,$t)=@_; 750 $code.=<<___; 751 movdqa $b,$t 752 psrlq \$$n,$b 753 pxor $a,$b 754 pand $mask,$b 755 pxor $b,$a 756 psllq \$$n,$b 757 pxor $t,$b 758 ___ 759 } 760 sub swapmove2x { 761 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 762 $code.=<<___; 763 movdqa $b0,$t0 764 psrlq \$$n,$b0 765 movdqa $b1,$t1 766 psrlq \$$n,$b1 767 pxor $a0,$b0 768 pxor $a1,$b1 769 pand $mask,$b0 770 pand $mask,$b1 771 pxor $b0,$a0 772 psllq \$$n,$b0 773 pxor $b1,$a1 774 psllq \$$n,$b1 775 pxor $t0,$b0 776 pxor $t1,$b1 777 ___ 778 } 779 780 sub bitslice { 781 my @x=reverse(@_[0..7]); 782 my ($t0,$t1,$t2,$t3)=@_[8..11]; 783 $code.=<<___; 784 movdqa 0x00($const),$t0 # .LBS0 785 movdqa 0x10($const),$t1 # .LBS1 786 ___ 787 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 788 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 789 $code.=<<___; 790 movdqa 0x20($const),$t0 # .LBS2 791 ___ 792 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 793 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 794 795 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 796 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 797 } 798 799 $code.=<<___; 800 .text 801 802 .extern asm_AES_encrypt 803 .extern asm_AES_decrypt 804 805 .type _bsaes_encrypt8,\@abi-omnipotent 806 .align 64 807 _bsaes_encrypt8: 808 lea .LBS0(%rip), $const # constants table 809 810 movdqa ($key), @XMM[9] # round 0 key 811 lea 0x10($key), $key 812 movdqa 0x50($const), @XMM[8] # .LM0SR 813 pxor @XMM[9], @XMM[0] # xor with round0 key 814 pxor @XMM[9], @XMM[1] 815 pxor @XMM[9], @XMM[2] 816 pxor @XMM[9], @XMM[3] 817 pshufb @XMM[8], @XMM[0] 818 pshufb @XMM[8], @XMM[1] 819 pxor @XMM[9], @XMM[4] 820 pxor @XMM[9], @XMM[5] 821 pshufb @XMM[8], @XMM[2] 822 pshufb @XMM[8], @XMM[3] 823 pxor @XMM[9], @XMM[6] 824 pxor @XMM[9], @XMM[7] 825 pshufb @XMM[8], @XMM[4] 826 pshufb @XMM[8], @XMM[5] 827 pshufb @XMM[8], @XMM[6] 828 pshufb @XMM[8], @XMM[7] 829 _bsaes_encrypt8_bitslice: 830 ___ 831 &bitslice (@XMM[0..7, 8..11]); 832 $code.=<<___; 833 dec $rounds 834 jmp .Lenc_sbox 835 .align 16 836 .Lenc_loop: 837 ___ 838 &ShiftRows (@XMM[0..7, 8]); 839 $code.=".Lenc_sbox:\n"; 840 &Sbox (@XMM[0..7, 8..15]); 841 $code.=<<___; 842 dec $rounds 843 jl .Lenc_done 844 ___ 845 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 846 $code.=<<___; 847 movdqa 0x30($const), @XMM[8] # .LSR 848 jnz .Lenc_loop 849 movdqa 0x40($const), @XMM[8] # .LSRM0 850 jmp .Lenc_loop 851 .align 16 852 .Lenc_done: 853 ___ 854 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 855 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 856 $code.=<<___; 857 movdqa ($key), @XMM[8] # last round key 858 pxor @XMM[8], @XMM[4] 859 pxor @XMM[8], @XMM[6] 860 pxor @XMM[8], @XMM[3] 861 pxor @XMM[8], @XMM[7] 862 pxor @XMM[8], @XMM[2] 863 pxor @XMM[8], @XMM[5] 864 pxor @XMM[8], @XMM[0] 865 pxor @XMM[8], @XMM[1] 866 ret 867 .size _bsaes_encrypt8,.-_bsaes_encrypt8 868 869 .type _bsaes_decrypt8,\@abi-omnipotent 870 .align 64 871 _bsaes_decrypt8: 872 lea .LBS0(%rip), $const # constants table 873 874 movdqa ($key), @XMM[9] # round 0 key 875 lea 0x10($key), $key 876 movdqa -0x30($const), @XMM[8] # .LM0ISR 877 pxor @XMM[9], @XMM[0] # xor with round0 key 878 pxor @XMM[9], @XMM[1] 879 pxor @XMM[9], @XMM[2] 880 pxor @XMM[9], @XMM[3] 881 pshufb @XMM[8], @XMM[0] 882 pshufb @XMM[8], @XMM[1] 883 pxor @XMM[9], @XMM[4] 884 pxor @XMM[9], @XMM[5] 885 pshufb @XMM[8], @XMM[2] 886 pshufb @XMM[8], @XMM[3] 887 pxor @XMM[9], @XMM[6] 888 pxor @XMM[9], @XMM[7] 889 pshufb @XMM[8], @XMM[4] 890 pshufb @XMM[8], @XMM[5] 891 pshufb @XMM[8], @XMM[6] 892 pshufb @XMM[8], @XMM[7] 893 ___ 894 &bitslice (@XMM[0..7, 8..11]); 895 $code.=<<___; 896 dec $rounds 897 jmp .Ldec_sbox 898 .align 16 899 .Ldec_loop: 900 ___ 901 &ShiftRows (@XMM[0..7, 8]); 902 $code.=".Ldec_sbox:\n"; 903 &InvSbox (@XMM[0..7, 8..15]); 904 $code.=<<___; 905 dec $rounds 906 jl .Ldec_done 907 ___ 908 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 909 $code.=<<___; 910 movdqa -0x10($const), @XMM[8] # .LISR 911 jnz .Ldec_loop 912 movdqa -0x20($const), @XMM[8] # .LISRM0 913 jmp .Ldec_loop 914 .align 16 915 .Ldec_done: 916 ___ 917 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 918 $code.=<<___; 919 movdqa ($key), @XMM[8] # last round key 920 pxor @XMM[8], @XMM[6] 921 pxor @XMM[8], @XMM[4] 922 pxor @XMM[8], @XMM[2] 923 pxor @XMM[8], @XMM[7] 924 pxor @XMM[8], @XMM[3] 925 pxor @XMM[8], @XMM[5] 926 pxor @XMM[8], @XMM[0] 927 pxor @XMM[8], @XMM[1] 928 ret 929 .size _bsaes_decrypt8,.-_bsaes_decrypt8 930 ___ 931 } 932 { 933 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 934 935 sub bitslice_key { 936 my @x=reverse(@_[0..7]); 937 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 938 939 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 940 $code.=<<___; 941 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 942 movdqa @x[0], @x[2] 943 movdqa @x[1], @x[3] 944 ___ 945 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 946 947 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 948 $code.=<<___; 949 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 950 movdqa @x[0], @x[4] 951 movdqa @x[2], @x[6] 952 movdqa @x[1], @x[5] 953 movdqa @x[3], @x[7] 954 ___ 955 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 956 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 957 } 958 959 $code.=<<___; 960 .type _bsaes_key_convert,\@abi-omnipotent 961 .align 16 962 _bsaes_key_convert: 963 lea .Lmasks(%rip), $const 964 movdqu ($inp), %xmm7 # load round 0 key 965 lea 0x10($inp), $inp 966 movdqa 0x00($const), %xmm0 # 0x01... 967 movdqa 0x10($const), %xmm1 # 0x02... 968 movdqa 0x20($const), %xmm2 # 0x04... 969 movdqa 0x30($const), %xmm3 # 0x08... 970 movdqa 0x40($const), %xmm4 # .LM0 971 pcmpeqd %xmm5, %xmm5 # .LNOT 972 973 movdqu ($inp), %xmm6 # load round 1 key 974 movdqa %xmm7, ($out) # save round 0 key 975 lea 0x10($out), $out 976 dec $rounds 977 jmp .Lkey_loop 978 .align 16 979 .Lkey_loop: 980 pshufb %xmm4, %xmm6 # .LM0 981 982 movdqa %xmm0, %xmm8 983 movdqa %xmm1, %xmm9 984 985 pand %xmm6, %xmm8 986 pand %xmm6, %xmm9 987 movdqa %xmm2, %xmm10 988 pcmpeqb %xmm0, %xmm8 989 psllq \$4, %xmm0 # 0x10... 990 movdqa %xmm3, %xmm11 991 pcmpeqb %xmm1, %xmm9 992 psllq \$4, %xmm1 # 0x20... 993 994 pand %xmm6, %xmm10 995 pand %xmm6, %xmm11 996 movdqa %xmm0, %xmm12 997 pcmpeqb %xmm2, %xmm10 998 psllq \$4, %xmm2 # 0x40... 999 movdqa %xmm1, %xmm13 1000 pcmpeqb %xmm3, %xmm11 1001 psllq \$4, %xmm3 # 0x80... 1002 1003 movdqa %xmm2, %xmm14 1004 movdqa %xmm3, %xmm15 1005 pxor %xmm5, %xmm8 # "pnot" 1006 pxor %xmm5, %xmm9 1007 1008 pand %xmm6, %xmm12 1009 pand %xmm6, %xmm13 1010 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1011 pcmpeqb %xmm0, %xmm12 1012 psrlq \$4, %xmm0 # 0x01... 1013 movdqa %xmm9, 0x10($out) 1014 pcmpeqb %xmm1, %xmm13 1015 psrlq \$4, %xmm1 # 0x02... 1016 lea 0x10($inp), $inp 1017 1018 pand %xmm6, %xmm14 1019 pand %xmm6, %xmm15 1020 movdqa %xmm10, 0x20($out) 1021 pcmpeqb %xmm2, %xmm14 1022 psrlq \$4, %xmm2 # 0x04... 1023 movdqa %xmm11, 0x30($out) 1024 pcmpeqb %xmm3, %xmm15 1025 psrlq \$4, %xmm3 # 0x08... 1026 movdqu ($inp), %xmm6 # load next round key 1027 1028 pxor %xmm5, %xmm13 # "pnot" 1029 pxor %xmm5, %xmm14 1030 movdqa %xmm12, 0x40($out) 1031 movdqa %xmm13, 0x50($out) 1032 movdqa %xmm14, 0x60($out) 1033 movdqa %xmm15, 0x70($out) 1034 lea 0x80($out),$out 1035 dec $rounds 1036 jnz .Lkey_loop 1037 1038 movdqa 0x50($const), %xmm7 # .L63 1039 #movdqa %xmm6, ($out) # don't save last round key 1040 ret 1041 .size _bsaes_key_convert,.-_bsaes_key_convert 1042 ___ 1043 } 1044 1045 if (0 && !$win64) { # following four functions are unsupported interface 1046 # used for benchmarking... 1047 $code.=<<___; 1048 .globl bsaes_enc_key_convert 1049 .type bsaes_enc_key_convert,\@function,2 1050 .align 16 1051 bsaes_enc_key_convert: 1052 mov 240($inp),%r10d # pass rounds 1053 mov $inp,%rcx # pass key 1054 mov $out,%rax # pass key schedule 1055 call _bsaes_key_convert 1056 pxor %xmm6,%xmm7 # fix up last round key 1057 movdqa %xmm7,(%rax) # save last round key 1058 ret 1059 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1060 1061 .globl bsaes_encrypt_128 1062 .type bsaes_encrypt_128,\@function,4 1063 .align 16 1064 bsaes_encrypt_128: 1065 .Lenc128_loop: 1066 movdqu 0x00($inp), @XMM[0] # load input 1067 movdqu 0x10($inp), @XMM[1] 1068 movdqu 0x20($inp), @XMM[2] 1069 movdqu 0x30($inp), @XMM[3] 1070 movdqu 0x40($inp), @XMM[4] 1071 movdqu 0x50($inp), @XMM[5] 1072 movdqu 0x60($inp), @XMM[6] 1073 movdqu 0x70($inp), @XMM[7] 1074 mov $key, %rax # pass the $key 1075 lea 0x80($inp), $inp 1076 mov \$10,%r10d 1077 1078 call _bsaes_encrypt8 1079 1080 movdqu @XMM[0], 0x00($out) # write output 1081 movdqu @XMM[1], 0x10($out) 1082 movdqu @XMM[4], 0x20($out) 1083 movdqu @XMM[6], 0x30($out) 1084 movdqu @XMM[3], 0x40($out) 1085 movdqu @XMM[7], 0x50($out) 1086 movdqu @XMM[2], 0x60($out) 1087 movdqu @XMM[5], 0x70($out) 1088 lea 0x80($out), $out 1089 sub \$0x80,$len 1090 ja .Lenc128_loop 1091 ret 1092 .size bsaes_encrypt_128,.-bsaes_encrypt_128 1093 1094 .globl bsaes_dec_key_convert 1095 .type bsaes_dec_key_convert,\@function,2 1096 .align 16 1097 bsaes_dec_key_convert: 1098 mov 240($inp),%r10d # pass rounds 1099 mov $inp,%rcx # pass key 1100 mov $out,%rax # pass key schedule 1101 call _bsaes_key_convert 1102 pxor ($out),%xmm7 # fix up round 0 key 1103 movdqa %xmm6,(%rax) # save last round key 1104 movdqa %xmm7,($out) 1105 ret 1106 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1107 1108 .globl bsaes_decrypt_128 1109 .type bsaes_decrypt_128,\@function,4 1110 .align 16 1111 bsaes_decrypt_128: 1112 .Ldec128_loop: 1113 movdqu 0x00($inp), @XMM[0] # load input 1114 movdqu 0x10($inp), @XMM[1] 1115 movdqu 0x20($inp), @XMM[2] 1116 movdqu 0x30($inp), @XMM[3] 1117 movdqu 0x40($inp), @XMM[4] 1118 movdqu 0x50($inp), @XMM[5] 1119 movdqu 0x60($inp), @XMM[6] 1120 movdqu 0x70($inp), @XMM[7] 1121 mov $key, %rax # pass the $key 1122 lea 0x80($inp), $inp 1123 mov \$10,%r10d 1124 1125 call _bsaes_decrypt8 1126 1127 movdqu @XMM[0], 0x00($out) # write output 1128 movdqu @XMM[1], 0x10($out) 1129 movdqu @XMM[6], 0x20($out) 1130 movdqu @XMM[4], 0x30($out) 1131 movdqu @XMM[2], 0x40($out) 1132 movdqu @XMM[7], 0x50($out) 1133 movdqu @XMM[3], 0x60($out) 1134 movdqu @XMM[5], 0x70($out) 1135 lea 0x80($out), $out 1136 sub \$0x80,$len 1137 ja .Ldec128_loop 1138 ret 1139 .size bsaes_decrypt_128,.-bsaes_decrypt_128 1140 ___ 1141 } 1142 { 1143 ###################################################################### 1144 # 1145 # OpenSSL interface 1146 # 1147 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1148 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1149 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1150 1151 if ($ecb) { 1152 $code.=<<___; 1153 .globl bsaes_ecb_encrypt_blocks 1154 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1155 .align 16 1156 bsaes_ecb_encrypt_blocks: 1157 mov %rsp, %rax 1158 .Lecb_enc_prologue: 1159 push %rbp 1160 push %rbx 1161 push %r12 1162 push %r13 1163 push %r14 1164 push %r15 1165 lea -0x48(%rsp),%rsp 1166 ___ 1167 $code.=<<___ if ($win64); 1168 lea -0xa0(%rsp), %rsp 1169 movaps %xmm6, 0x40(%rsp) 1170 movaps %xmm7, 0x50(%rsp) 1171 movaps %xmm8, 0x60(%rsp) 1172 movaps %xmm9, 0x70(%rsp) 1173 movaps %xmm10, 0x80(%rsp) 1174 movaps %xmm11, 0x90(%rsp) 1175 movaps %xmm12, 0xa0(%rsp) 1176 movaps %xmm13, 0xb0(%rsp) 1177 movaps %xmm14, 0xc0(%rsp) 1178 movaps %xmm15, 0xd0(%rsp) 1179 .Lecb_enc_body: 1180 ___ 1181 $code.=<<___; 1182 mov %rsp,%rbp # backup %rsp 1183 mov 240($arg4),%eax # rounds 1184 mov $arg1,$inp # backup arguments 1185 mov $arg2,$out 1186 mov $arg3,$len 1187 mov $arg4,$key 1188 cmp \$8,$arg3 1189 jb .Lecb_enc_short 1190 1191 mov %eax,%ebx # backup rounds 1192 shl \$7,%rax # 128 bytes per inner round key 1193 sub \$`128-32`,%rax # size of bit-sliced key schedule 1194 sub %rax,%rsp 1195 mov %rsp,%rax # pass key schedule 1196 mov $key,%rcx # pass key 1197 mov %ebx,%r10d # pass rounds 1198 call _bsaes_key_convert 1199 pxor %xmm6,%xmm7 # fix up last round key 1200 movdqa %xmm7,(%rax) # save last round key 1201 1202 sub \$8,$len 1203 .Lecb_enc_loop: 1204 movdqu 0x00($inp), @XMM[0] # load input 1205 movdqu 0x10($inp), @XMM[1] 1206 movdqu 0x20($inp), @XMM[2] 1207 movdqu 0x30($inp), @XMM[3] 1208 movdqu 0x40($inp), @XMM[4] 1209 movdqu 0x50($inp), @XMM[5] 1210 mov %rsp, %rax # pass key schedule 1211 movdqu 0x60($inp), @XMM[6] 1212 mov %ebx,%r10d # pass rounds 1213 movdqu 0x70($inp), @XMM[7] 1214 lea 0x80($inp), $inp 1215 1216 call _bsaes_encrypt8 1217 1218 movdqu @XMM[0], 0x00($out) # write output 1219 movdqu @XMM[1], 0x10($out) 1220 movdqu @XMM[4], 0x20($out) 1221 movdqu @XMM[6], 0x30($out) 1222 movdqu @XMM[3], 0x40($out) 1223 movdqu @XMM[7], 0x50($out) 1224 movdqu @XMM[2], 0x60($out) 1225 movdqu @XMM[5], 0x70($out) 1226 lea 0x80($out), $out 1227 sub \$8,$len 1228 jnc .Lecb_enc_loop 1229 1230 add \$8,$len 1231 jz .Lecb_enc_done 1232 1233 movdqu 0x00($inp), @XMM[0] # load input 1234 mov %rsp, %rax # pass key schedule 1235 mov %ebx,%r10d # pass rounds 1236 cmp \$2,$len 1237 jb .Lecb_enc_one 1238 movdqu 0x10($inp), @XMM[1] 1239 je .Lecb_enc_two 1240 movdqu 0x20($inp), @XMM[2] 1241 cmp \$4,$len 1242 jb .Lecb_enc_three 1243 movdqu 0x30($inp), @XMM[3] 1244 je .Lecb_enc_four 1245 movdqu 0x40($inp), @XMM[4] 1246 cmp \$6,$len 1247 jb .Lecb_enc_five 1248 movdqu 0x50($inp), @XMM[5] 1249 je .Lecb_enc_six 1250 movdqu 0x60($inp), @XMM[6] 1251 call _bsaes_encrypt8 1252 movdqu @XMM[0], 0x00($out) # write output 1253 movdqu @XMM[1], 0x10($out) 1254 movdqu @XMM[4], 0x20($out) 1255 movdqu @XMM[6], 0x30($out) 1256 movdqu @XMM[3], 0x40($out) 1257 movdqu @XMM[7], 0x50($out) 1258 movdqu @XMM[2], 0x60($out) 1259 jmp .Lecb_enc_done 1260 .align 16 1261 .Lecb_enc_six: 1262 call _bsaes_encrypt8 1263 movdqu @XMM[0], 0x00($out) # write output 1264 movdqu @XMM[1], 0x10($out) 1265 movdqu @XMM[4], 0x20($out) 1266 movdqu @XMM[6], 0x30($out) 1267 movdqu @XMM[3], 0x40($out) 1268 movdqu @XMM[7], 0x50($out) 1269 jmp .Lecb_enc_done 1270 .align 16 1271 .Lecb_enc_five: 1272 call _bsaes_encrypt8 1273 movdqu @XMM[0], 0x00($out) # write output 1274 movdqu @XMM[1], 0x10($out) 1275 movdqu @XMM[4], 0x20($out) 1276 movdqu @XMM[6], 0x30($out) 1277 movdqu @XMM[3], 0x40($out) 1278 jmp .Lecb_enc_done 1279 .align 16 1280 .Lecb_enc_four: 1281 call _bsaes_encrypt8 1282 movdqu @XMM[0], 0x00($out) # write output 1283 movdqu @XMM[1], 0x10($out) 1284 movdqu @XMM[4], 0x20($out) 1285 movdqu @XMM[6], 0x30($out) 1286 jmp .Lecb_enc_done 1287 .align 16 1288 .Lecb_enc_three: 1289 call _bsaes_encrypt8 1290 movdqu @XMM[0], 0x00($out) # write output 1291 movdqu @XMM[1], 0x10($out) 1292 movdqu @XMM[4], 0x20($out) 1293 jmp .Lecb_enc_done 1294 .align 16 1295 .Lecb_enc_two: 1296 call _bsaes_encrypt8 1297 movdqu @XMM[0], 0x00($out) # write output 1298 movdqu @XMM[1], 0x10($out) 1299 jmp .Lecb_enc_done 1300 .align 16 1301 .Lecb_enc_one: 1302 call _bsaes_encrypt8 1303 movdqu @XMM[0], 0x00($out) # write output 1304 jmp .Lecb_enc_done 1305 .align 16 1306 .Lecb_enc_short: 1307 lea ($inp), $arg1 1308 lea ($out), $arg2 1309 lea ($key), $arg3 1310 call asm_AES_encrypt 1311 lea 16($inp), $inp 1312 lea 16($out), $out 1313 dec $len 1314 jnz .Lecb_enc_short 1315 1316 .Lecb_enc_done: 1317 lea (%rsp),%rax 1318 pxor %xmm0, %xmm0 1319 .Lecb_enc_bzero: # wipe key schedule [if any] 1320 movdqa %xmm0, 0x00(%rax) 1321 movdqa %xmm0, 0x10(%rax) 1322 lea 0x20(%rax), %rax 1323 cmp %rax, %rbp 1324 jb .Lecb_enc_bzero 1325 1326 lea (%rbp),%rsp # restore %rsp 1327 ___ 1328 $code.=<<___ if ($win64); 1329 movaps 0x40(%rbp), %xmm6 1330 movaps 0x50(%rbp), %xmm7 1331 movaps 0x60(%rbp), %xmm8 1332 movaps 0x70(%rbp), %xmm9 1333 movaps 0x80(%rbp), %xmm10 1334 movaps 0x90(%rbp), %xmm11 1335 movaps 0xa0(%rbp), %xmm12 1336 movaps 0xb0(%rbp), %xmm13 1337 movaps 0xc0(%rbp), %xmm14 1338 movaps 0xd0(%rbp), %xmm15 1339 lea 0xa0(%rbp), %rsp 1340 ___ 1341 $code.=<<___; 1342 mov 0x48(%rsp), %r15 1343 mov 0x50(%rsp), %r14 1344 mov 0x58(%rsp), %r13 1345 mov 0x60(%rsp), %r12 1346 mov 0x68(%rsp), %rbx 1347 mov 0x70(%rsp), %rax 1348 lea 0x78(%rsp), %rsp 1349 mov %rax, %rbp 1350 .Lecb_enc_epilogue: 1351 ret 1352 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1353 1354 .globl bsaes_ecb_decrypt_blocks 1355 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1356 .align 16 1357 bsaes_ecb_decrypt_blocks: 1358 mov %rsp, %rax 1359 .Lecb_dec_prologue: 1360 push %rbp 1361 push %rbx 1362 push %r12 1363 push %r13 1364 push %r14 1365 push %r15 1366 lea -0x48(%rsp),%rsp 1367 ___ 1368 $code.=<<___ if ($win64); 1369 lea -0xa0(%rsp), %rsp 1370 movaps %xmm6, 0x40(%rsp) 1371 movaps %xmm7, 0x50(%rsp) 1372 movaps %xmm8, 0x60(%rsp) 1373 movaps %xmm9, 0x70(%rsp) 1374 movaps %xmm10, 0x80(%rsp) 1375 movaps %xmm11, 0x90(%rsp) 1376 movaps %xmm12, 0xa0(%rsp) 1377 movaps %xmm13, 0xb0(%rsp) 1378 movaps %xmm14, 0xc0(%rsp) 1379 movaps %xmm15, 0xd0(%rsp) 1380 .Lecb_dec_body: 1381 ___ 1382 $code.=<<___; 1383 mov %rsp,%rbp # backup %rsp 1384 mov 240($arg4),%eax # rounds 1385 mov $arg1,$inp # backup arguments 1386 mov $arg2,$out 1387 mov $arg3,$len 1388 mov $arg4,$key 1389 cmp \$8,$arg3 1390 jb .Lecb_dec_short 1391 1392 mov %eax,%ebx # backup rounds 1393 shl \$7,%rax # 128 bytes per inner round key 1394 sub \$`128-32`,%rax # size of bit-sliced key schedule 1395 sub %rax,%rsp 1396 mov %rsp,%rax # pass key schedule 1397 mov $key,%rcx # pass key 1398 mov %ebx,%r10d # pass rounds 1399 call _bsaes_key_convert 1400 pxor (%rsp),%xmm7 # fix up 0 round key 1401 movdqa %xmm6,(%rax) # save last round key 1402 movdqa %xmm7,(%rsp) 1403 1404 sub \$8,$len 1405 .Lecb_dec_loop: 1406 movdqu 0x00($inp), @XMM[0] # load input 1407 movdqu 0x10($inp), @XMM[1] 1408 movdqu 0x20($inp), @XMM[2] 1409 movdqu 0x30($inp), @XMM[3] 1410 movdqu 0x40($inp), @XMM[4] 1411 movdqu 0x50($inp), @XMM[5] 1412 mov %rsp, %rax # pass key schedule 1413 movdqu 0x60($inp), @XMM[6] 1414 mov %ebx,%r10d # pass rounds 1415 movdqu 0x70($inp), @XMM[7] 1416 lea 0x80($inp), $inp 1417 1418 call _bsaes_decrypt8 1419 1420 movdqu @XMM[0], 0x00($out) # write output 1421 movdqu @XMM[1], 0x10($out) 1422 movdqu @XMM[6], 0x20($out) 1423 movdqu @XMM[4], 0x30($out) 1424 movdqu @XMM[2], 0x40($out) 1425 movdqu @XMM[7], 0x50($out) 1426 movdqu @XMM[3], 0x60($out) 1427 movdqu @XMM[5], 0x70($out) 1428 lea 0x80($out), $out 1429 sub \$8,$len 1430 jnc .Lecb_dec_loop 1431 1432 add \$8,$len 1433 jz .Lecb_dec_done 1434 1435 movdqu 0x00($inp), @XMM[0] # load input 1436 mov %rsp, %rax # pass key schedule 1437 mov %ebx,%r10d # pass rounds 1438 cmp \$2,$len 1439 jb .Lecb_dec_one 1440 movdqu 0x10($inp), @XMM[1] 1441 je .Lecb_dec_two 1442 movdqu 0x20($inp), @XMM[2] 1443 cmp \$4,$len 1444 jb .Lecb_dec_three 1445 movdqu 0x30($inp), @XMM[3] 1446 je .Lecb_dec_four 1447 movdqu 0x40($inp), @XMM[4] 1448 cmp \$6,$len 1449 jb .Lecb_dec_five 1450 movdqu 0x50($inp), @XMM[5] 1451 je .Lecb_dec_six 1452 movdqu 0x60($inp), @XMM[6] 1453 call _bsaes_decrypt8 1454 movdqu @XMM[0], 0x00($out) # write output 1455 movdqu @XMM[1], 0x10($out) 1456 movdqu @XMM[6], 0x20($out) 1457 movdqu @XMM[4], 0x30($out) 1458 movdqu @XMM[2], 0x40($out) 1459 movdqu @XMM[7], 0x50($out) 1460 movdqu @XMM[3], 0x60($out) 1461 jmp .Lecb_dec_done 1462 .align 16 1463 .Lecb_dec_six: 1464 call _bsaes_decrypt8 1465 movdqu @XMM[0], 0x00($out) # write output 1466 movdqu @XMM[1], 0x10($out) 1467 movdqu @XMM[6], 0x20($out) 1468 movdqu @XMM[4], 0x30($out) 1469 movdqu @XMM[2], 0x40($out) 1470 movdqu @XMM[7], 0x50($out) 1471 jmp .Lecb_dec_done 1472 .align 16 1473 .Lecb_dec_five: 1474 call _bsaes_decrypt8 1475 movdqu @XMM[0], 0x00($out) # write output 1476 movdqu @XMM[1], 0x10($out) 1477 movdqu @XMM[6], 0x20($out) 1478 movdqu @XMM[4], 0x30($out) 1479 movdqu @XMM[2], 0x40($out) 1480 jmp .Lecb_dec_done 1481 .align 16 1482 .Lecb_dec_four: 1483 call _bsaes_decrypt8 1484 movdqu @XMM[0], 0x00($out) # write output 1485 movdqu @XMM[1], 0x10($out) 1486 movdqu @XMM[6], 0x20($out) 1487 movdqu @XMM[4], 0x30($out) 1488 jmp .Lecb_dec_done 1489 .align 16 1490 .Lecb_dec_three: 1491 call _bsaes_decrypt8 1492 movdqu @XMM[0], 0x00($out) # write output 1493 movdqu @XMM[1], 0x10($out) 1494 movdqu @XMM[6], 0x20($out) 1495 jmp .Lecb_dec_done 1496 .align 16 1497 .Lecb_dec_two: 1498 call _bsaes_decrypt8 1499 movdqu @XMM[0], 0x00($out) # write output 1500 movdqu @XMM[1], 0x10($out) 1501 jmp .Lecb_dec_done 1502 .align 16 1503 .Lecb_dec_one: 1504 call _bsaes_decrypt8 1505 movdqu @XMM[0], 0x00($out) # write output 1506 jmp .Lecb_dec_done 1507 .align 16 1508 .Lecb_dec_short: 1509 lea ($inp), $arg1 1510 lea ($out), $arg2 1511 lea ($key), $arg3 1512 call asm_AES_decrypt 1513 lea 16($inp), $inp 1514 lea 16($out), $out 1515 dec $len 1516 jnz .Lecb_dec_short 1517 1518 .Lecb_dec_done: 1519 lea (%rsp),%rax 1520 pxor %xmm0, %xmm0 1521 .Lecb_dec_bzero: # wipe key schedule [if any] 1522 movdqa %xmm0, 0x00(%rax) 1523 movdqa %xmm0, 0x10(%rax) 1524 lea 0x20(%rax), %rax 1525 cmp %rax, %rbp 1526 jb .Lecb_dec_bzero 1527 1528 lea (%rbp),%rsp # restore %rsp 1529 ___ 1530 $code.=<<___ if ($win64); 1531 movaps 0x40(%rbp), %xmm6 1532 movaps 0x50(%rbp), %xmm7 1533 movaps 0x60(%rbp), %xmm8 1534 movaps 0x70(%rbp), %xmm9 1535 movaps 0x80(%rbp), %xmm10 1536 movaps 0x90(%rbp), %xmm11 1537 movaps 0xa0(%rbp), %xmm12 1538 movaps 0xb0(%rbp), %xmm13 1539 movaps 0xc0(%rbp), %xmm14 1540 movaps 0xd0(%rbp), %xmm15 1541 lea 0xa0(%rbp), %rsp 1542 ___ 1543 $code.=<<___; 1544 mov 0x48(%rsp), %r15 1545 mov 0x50(%rsp), %r14 1546 mov 0x58(%rsp), %r13 1547 mov 0x60(%rsp), %r12 1548 mov 0x68(%rsp), %rbx 1549 mov 0x70(%rsp), %rax 1550 lea 0x78(%rsp), %rsp 1551 mov %rax, %rbp 1552 .Lecb_dec_epilogue: 1553 ret 1554 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1555 ___ 1556 } 1557 $code.=<<___; 1558 .extern asm_AES_cbc_encrypt 1559 .globl bsaes_cbc_encrypt 1560 .type bsaes_cbc_encrypt,\@abi-omnipotent 1561 .align 16 1562 bsaes_cbc_encrypt: 1563 ___ 1564 $code.=<<___ if ($win64); 1565 mov 48(%rsp),$arg6 # pull direction flag 1566 ___ 1567 $code.=<<___; 1568 cmp \$0,$arg6 1569 jne asm_AES_cbc_encrypt 1570 cmp \$128,$arg3 1571 jb asm_AES_cbc_encrypt 1572 1573 mov %rsp, %rax 1574 .Lcbc_dec_prologue: 1575 push %rbp 1576 push %rbx 1577 push %r12 1578 push %r13 1579 push %r14 1580 push %r15 1581 lea -0x48(%rsp), %rsp 1582 ___ 1583 $code.=<<___ if ($win64); 1584 mov 0xa0(%rsp),$arg5 # pull ivp 1585 lea -0xa0(%rsp), %rsp 1586 movaps %xmm6, 0x40(%rsp) 1587 movaps %xmm7, 0x50(%rsp) 1588 movaps %xmm8, 0x60(%rsp) 1589 movaps %xmm9, 0x70(%rsp) 1590 movaps %xmm10, 0x80(%rsp) 1591 movaps %xmm11, 0x90(%rsp) 1592 movaps %xmm12, 0xa0(%rsp) 1593 movaps %xmm13, 0xb0(%rsp) 1594 movaps %xmm14, 0xc0(%rsp) 1595 movaps %xmm15, 0xd0(%rsp) 1596 .Lcbc_dec_body: 1597 ___ 1598 $code.=<<___; 1599 mov %rsp, %rbp # backup %rsp 1600 mov 240($arg4), %eax # rounds 1601 mov $arg1, $inp # backup arguments 1602 mov $arg2, $out 1603 mov $arg3, $len 1604 mov $arg4, $key 1605 mov $arg5, %rbx 1606 shr \$4, $len # bytes to blocks 1607 1608 mov %eax, %edx # rounds 1609 shl \$7, %rax # 128 bytes per inner round key 1610 sub \$`128-32`, %rax # size of bit-sliced key schedule 1611 sub %rax, %rsp 1612 1613 mov %rsp, %rax # pass key schedule 1614 mov $key, %rcx # pass key 1615 mov %edx, %r10d # pass rounds 1616 call _bsaes_key_convert 1617 pxor (%rsp),%xmm7 # fix up 0 round key 1618 movdqa %xmm6,(%rax) # save last round key 1619 movdqa %xmm7,(%rsp) 1620 1621 movdqu (%rbx), @XMM[15] # load IV 1622 sub \$8,$len 1623 .Lcbc_dec_loop: 1624 movdqu 0x00($inp), @XMM[0] # load input 1625 movdqu 0x10($inp), @XMM[1] 1626 movdqu 0x20($inp), @XMM[2] 1627 movdqu 0x30($inp), @XMM[3] 1628 movdqu 0x40($inp), @XMM[4] 1629 movdqu 0x50($inp), @XMM[5] 1630 mov %rsp, %rax # pass key schedule 1631 movdqu 0x60($inp), @XMM[6] 1632 mov %edx,%r10d # pass rounds 1633 movdqu 0x70($inp), @XMM[7] 1634 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1635 1636 call _bsaes_decrypt8 1637 1638 pxor 0x20(%rbp), @XMM[0] # ^= IV 1639 movdqu 0x00($inp), @XMM[8] # re-load input 1640 movdqu 0x10($inp), @XMM[9] 1641 pxor @XMM[8], @XMM[1] 1642 movdqu 0x20($inp), @XMM[10] 1643 pxor @XMM[9], @XMM[6] 1644 movdqu 0x30($inp), @XMM[11] 1645 pxor @XMM[10], @XMM[4] 1646 movdqu 0x40($inp), @XMM[12] 1647 pxor @XMM[11], @XMM[2] 1648 movdqu 0x50($inp), @XMM[13] 1649 pxor @XMM[12], @XMM[7] 1650 movdqu 0x60($inp), @XMM[14] 1651 pxor @XMM[13], @XMM[3] 1652 movdqu 0x70($inp), @XMM[15] # IV 1653 pxor @XMM[14], @XMM[5] 1654 movdqu @XMM[0], 0x00($out) # write output 1655 lea 0x80($inp), $inp 1656 movdqu @XMM[1], 0x10($out) 1657 movdqu @XMM[6], 0x20($out) 1658 movdqu @XMM[4], 0x30($out) 1659 movdqu @XMM[2], 0x40($out) 1660 movdqu @XMM[7], 0x50($out) 1661 movdqu @XMM[3], 0x60($out) 1662 movdqu @XMM[5], 0x70($out) 1663 lea 0x80($out), $out 1664 sub \$8,$len 1665 jnc .Lcbc_dec_loop 1666 1667 add \$8,$len 1668 jz .Lcbc_dec_done 1669 1670 movdqu 0x00($inp), @XMM[0] # load input 1671 mov %rsp, %rax # pass key schedule 1672 mov %edx, %r10d # pass rounds 1673 cmp \$2,$len 1674 jb .Lcbc_dec_one 1675 movdqu 0x10($inp), @XMM[1] 1676 je .Lcbc_dec_two 1677 movdqu 0x20($inp), @XMM[2] 1678 cmp \$4,$len 1679 jb .Lcbc_dec_three 1680 movdqu 0x30($inp), @XMM[3] 1681 je .Lcbc_dec_four 1682 movdqu 0x40($inp), @XMM[4] 1683 cmp \$6,$len 1684 jb .Lcbc_dec_five 1685 movdqu 0x50($inp), @XMM[5] 1686 je .Lcbc_dec_six 1687 movdqu 0x60($inp), @XMM[6] 1688 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1689 call _bsaes_decrypt8 1690 pxor 0x20(%rbp), @XMM[0] # ^= IV 1691 movdqu 0x00($inp), @XMM[8] # re-load input 1692 movdqu 0x10($inp), @XMM[9] 1693 pxor @XMM[8], @XMM[1] 1694 movdqu 0x20($inp), @XMM[10] 1695 pxor @XMM[9], @XMM[6] 1696 movdqu 0x30($inp), @XMM[11] 1697 pxor @XMM[10], @XMM[4] 1698 movdqu 0x40($inp), @XMM[12] 1699 pxor @XMM[11], @XMM[2] 1700 movdqu 0x50($inp), @XMM[13] 1701 pxor @XMM[12], @XMM[7] 1702 movdqu 0x60($inp), @XMM[15] # IV 1703 pxor @XMM[13], @XMM[3] 1704 movdqu @XMM[0], 0x00($out) # write output 1705 movdqu @XMM[1], 0x10($out) 1706 movdqu @XMM[6], 0x20($out) 1707 movdqu @XMM[4], 0x30($out) 1708 movdqu @XMM[2], 0x40($out) 1709 movdqu @XMM[7], 0x50($out) 1710 movdqu @XMM[3], 0x60($out) 1711 jmp .Lcbc_dec_done 1712 .align 16 1713 .Lcbc_dec_six: 1714 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1715 call _bsaes_decrypt8 1716 pxor 0x20(%rbp), @XMM[0] # ^= IV 1717 movdqu 0x00($inp), @XMM[8] # re-load input 1718 movdqu 0x10($inp), @XMM[9] 1719 pxor @XMM[8], @XMM[1] 1720 movdqu 0x20($inp), @XMM[10] 1721 pxor @XMM[9], @XMM[6] 1722 movdqu 0x30($inp), @XMM[11] 1723 pxor @XMM[10], @XMM[4] 1724 movdqu 0x40($inp), @XMM[12] 1725 pxor @XMM[11], @XMM[2] 1726 movdqu 0x50($inp), @XMM[15] # IV 1727 pxor @XMM[12], @XMM[7] 1728 movdqu @XMM[0], 0x00($out) # write output 1729 movdqu @XMM[1], 0x10($out) 1730 movdqu @XMM[6], 0x20($out) 1731 movdqu @XMM[4], 0x30($out) 1732 movdqu @XMM[2], 0x40($out) 1733 movdqu @XMM[7], 0x50($out) 1734 jmp .Lcbc_dec_done 1735 .align 16 1736 .Lcbc_dec_five: 1737 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1738 call _bsaes_decrypt8 1739 pxor 0x20(%rbp), @XMM[0] # ^= IV 1740 movdqu 0x00($inp), @XMM[8] # re-load input 1741 movdqu 0x10($inp), @XMM[9] 1742 pxor @XMM[8], @XMM[1] 1743 movdqu 0x20($inp), @XMM[10] 1744 pxor @XMM[9], @XMM[6] 1745 movdqu 0x30($inp), @XMM[11] 1746 pxor @XMM[10], @XMM[4] 1747 movdqu 0x40($inp), @XMM[15] # IV 1748 pxor @XMM[11], @XMM[2] 1749 movdqu @XMM[0], 0x00($out) # write output 1750 movdqu @XMM[1], 0x10($out) 1751 movdqu @XMM[6], 0x20($out) 1752 movdqu @XMM[4], 0x30($out) 1753 movdqu @XMM[2], 0x40($out) 1754 jmp .Lcbc_dec_done 1755 .align 16 1756 .Lcbc_dec_four: 1757 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1758 call _bsaes_decrypt8 1759 pxor 0x20(%rbp), @XMM[0] # ^= IV 1760 movdqu 0x00($inp), @XMM[8] # re-load input 1761 movdqu 0x10($inp), @XMM[9] 1762 pxor @XMM[8], @XMM[1] 1763 movdqu 0x20($inp), @XMM[10] 1764 pxor @XMM[9], @XMM[6] 1765 movdqu 0x30($inp), @XMM[15] # IV 1766 pxor @XMM[10], @XMM[4] 1767 movdqu @XMM[0], 0x00($out) # write output 1768 movdqu @XMM[1], 0x10($out) 1769 movdqu @XMM[6], 0x20($out) 1770 movdqu @XMM[4], 0x30($out) 1771 jmp .Lcbc_dec_done 1772 .align 16 1773 .Lcbc_dec_three: 1774 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1775 call _bsaes_decrypt8 1776 pxor 0x20(%rbp), @XMM[0] # ^= IV 1777 movdqu 0x00($inp), @XMM[8] # re-load input 1778 movdqu 0x10($inp), @XMM[9] 1779 pxor @XMM[8], @XMM[1] 1780 movdqu 0x20($inp), @XMM[15] # IV 1781 pxor @XMM[9], @XMM[6] 1782 movdqu @XMM[0], 0x00($out) # write output 1783 movdqu @XMM[1], 0x10($out) 1784 movdqu @XMM[6], 0x20($out) 1785 jmp .Lcbc_dec_done 1786 .align 16 1787 .Lcbc_dec_two: 1788 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1789 call _bsaes_decrypt8 1790 pxor 0x20(%rbp), @XMM[0] # ^= IV 1791 movdqu 0x00($inp), @XMM[8] # re-load input 1792 movdqu 0x10($inp), @XMM[15] # IV 1793 pxor @XMM[8], @XMM[1] 1794 movdqu @XMM[0], 0x00($out) # write output 1795 movdqu @XMM[1], 0x10($out) 1796 jmp .Lcbc_dec_done 1797 .align 16 1798 .Lcbc_dec_one: 1799 lea ($inp), $arg1 1800 lea 0x20(%rbp), $arg2 # buffer output 1801 lea ($key), $arg3 1802 call asm_AES_decrypt # doesn't touch %xmm 1803 pxor 0x20(%rbp), @XMM[15] # ^= IV 1804 movdqu @XMM[15], ($out) # write output 1805 movdqa @XMM[0], @XMM[15] # IV 1806 1807 .Lcbc_dec_done: 1808 movdqu @XMM[15], (%rbx) # return IV 1809 lea (%rsp), %rax 1810 pxor %xmm0, %xmm0 1811 .Lcbc_dec_bzero: # wipe key schedule [if any] 1812 movdqa %xmm0, 0x00(%rax) 1813 movdqa %xmm0, 0x10(%rax) 1814 lea 0x20(%rax), %rax 1815 cmp %rax, %rbp 1816 ja .Lcbc_dec_bzero 1817 1818 lea (%rbp),%rsp # restore %rsp 1819 ___ 1820 $code.=<<___ if ($win64); 1821 movaps 0x40(%rbp), %xmm6 1822 movaps 0x50(%rbp), %xmm7 1823 movaps 0x60(%rbp), %xmm8 1824 movaps 0x70(%rbp), %xmm9 1825 movaps 0x80(%rbp), %xmm10 1826 movaps 0x90(%rbp), %xmm11 1827 movaps 0xa0(%rbp), %xmm12 1828 movaps 0xb0(%rbp), %xmm13 1829 movaps 0xc0(%rbp), %xmm14 1830 movaps 0xd0(%rbp), %xmm15 1831 lea 0xa0(%rbp), %rsp 1832 ___ 1833 $code.=<<___; 1834 mov 0x48(%rsp), %r15 1835 mov 0x50(%rsp), %r14 1836 mov 0x58(%rsp), %r13 1837 mov 0x60(%rsp), %r12 1838 mov 0x68(%rsp), %rbx 1839 mov 0x70(%rsp), %rax 1840 lea 0x78(%rsp), %rsp 1841 mov %rax, %rbp 1842 .Lcbc_dec_epilogue: 1843 ret 1844 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1845 1846 .globl bsaes_ctr32_encrypt_blocks 1847 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1848 .align 16 1849 bsaes_ctr32_encrypt_blocks: 1850 mov %rsp, %rax 1851 .Lctr_enc_prologue: 1852 push %rbp 1853 push %rbx 1854 push %r12 1855 push %r13 1856 push %r14 1857 push %r15 1858 lea -0x48(%rsp), %rsp 1859 ___ 1860 $code.=<<___ if ($win64); 1861 mov 0xa0(%rsp),$arg5 # pull ivp 1862 lea -0xa0(%rsp), %rsp 1863 movaps %xmm6, 0x40(%rsp) 1864 movaps %xmm7, 0x50(%rsp) 1865 movaps %xmm8, 0x60(%rsp) 1866 movaps %xmm9, 0x70(%rsp) 1867 movaps %xmm10, 0x80(%rsp) 1868 movaps %xmm11, 0x90(%rsp) 1869 movaps %xmm12, 0xa0(%rsp) 1870 movaps %xmm13, 0xb0(%rsp) 1871 movaps %xmm14, 0xc0(%rsp) 1872 movaps %xmm15, 0xd0(%rsp) 1873 .Lctr_enc_body: 1874 ___ 1875 $code.=<<___; 1876 mov %rsp, %rbp # backup %rsp 1877 movdqu ($arg5), %xmm0 # load counter 1878 mov 240($arg4), %eax # rounds 1879 mov $arg1, $inp # backup arguments 1880 mov $arg2, $out 1881 mov $arg3, $len 1882 mov $arg4, $key 1883 movdqa %xmm0, 0x20(%rbp) # copy counter 1884 cmp \$8, $arg3 1885 jb .Lctr_enc_short 1886 1887 mov %eax, %ebx # rounds 1888 shl \$7, %rax # 128 bytes per inner round key 1889 sub \$`128-32`, %rax # size of bit-sliced key schedule 1890 sub %rax, %rsp 1891 1892 mov %rsp, %rax # pass key schedule 1893 mov $key, %rcx # pass key 1894 mov %ebx, %r10d # pass rounds 1895 call _bsaes_key_convert 1896 pxor %xmm6,%xmm7 # fix up last round key 1897 movdqa %xmm7,(%rax) # save last round key 1898 1899 movdqa (%rsp), @XMM[9] # load round0 key 1900 lea .LADD1(%rip), %r11 1901 movdqa 0x20(%rbp), @XMM[0] # counter copy 1902 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1903 pshufb @XMM[8], @XMM[9] # byte swap upper part 1904 pshufb @XMM[8], @XMM[0] 1905 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1906 jmp .Lctr_enc_loop 1907 .align 16 1908 .Lctr_enc_loop: 1909 movdqa @XMM[0], 0x20(%rbp) # save counter 1910 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1911 movdqa @XMM[0], @XMM[2] 1912 paddd 0x00(%r11), @XMM[1] # .LADD1 1913 movdqa @XMM[0], @XMM[3] 1914 paddd 0x10(%r11), @XMM[2] # .LADD2 1915 movdqa @XMM[0], @XMM[4] 1916 paddd 0x20(%r11), @XMM[3] # .LADD3 1917 movdqa @XMM[0], @XMM[5] 1918 paddd 0x30(%r11), @XMM[4] # .LADD4 1919 movdqa @XMM[0], @XMM[6] 1920 paddd 0x40(%r11), @XMM[5] # .LADD5 1921 movdqa @XMM[0], @XMM[7] 1922 paddd 0x50(%r11), @XMM[6] # .LADD6 1923 paddd 0x60(%r11), @XMM[7] # .LADD7 1924 1925 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 1926 # to flip byte order in 32-bit counter 1927 movdqa (%rsp), @XMM[9] # round 0 key 1928 lea 0x10(%rsp), %rax # pass key schedule 1929 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 1930 pxor @XMM[9], @XMM[0] # xor with round0 key 1931 pxor @XMM[9], @XMM[1] 1932 pxor @XMM[9], @XMM[2] 1933 pxor @XMM[9], @XMM[3] 1934 pshufb @XMM[8], @XMM[0] 1935 pshufb @XMM[8], @XMM[1] 1936 pxor @XMM[9], @XMM[4] 1937 pxor @XMM[9], @XMM[5] 1938 pshufb @XMM[8], @XMM[2] 1939 pshufb @XMM[8], @XMM[3] 1940 pxor @XMM[9], @XMM[6] 1941 pxor @XMM[9], @XMM[7] 1942 pshufb @XMM[8], @XMM[4] 1943 pshufb @XMM[8], @XMM[5] 1944 pshufb @XMM[8], @XMM[6] 1945 pshufb @XMM[8], @XMM[7] 1946 lea .LBS0(%rip), %r11 # constants table 1947 mov %ebx,%r10d # pass rounds 1948 1949 call _bsaes_encrypt8_bitslice 1950 1951 sub \$8,$len 1952 jc .Lctr_enc_loop_done 1953 1954 movdqu 0x00($inp), @XMM[8] # load input 1955 movdqu 0x10($inp), @XMM[9] 1956 movdqu 0x20($inp), @XMM[10] 1957 movdqu 0x30($inp), @XMM[11] 1958 movdqu 0x40($inp), @XMM[12] 1959 movdqu 0x50($inp), @XMM[13] 1960 movdqu 0x60($inp), @XMM[14] 1961 movdqu 0x70($inp), @XMM[15] 1962 lea 0x80($inp),$inp 1963 pxor @XMM[0], @XMM[8] 1964 movdqa 0x20(%rbp), @XMM[0] # load counter 1965 pxor @XMM[9], @XMM[1] 1966 movdqu @XMM[8], 0x00($out) # write output 1967 pxor @XMM[10], @XMM[4] 1968 movdqu @XMM[1], 0x10($out) 1969 pxor @XMM[11], @XMM[6] 1970 movdqu @XMM[4], 0x20($out) 1971 pxor @XMM[12], @XMM[3] 1972 movdqu @XMM[6], 0x30($out) 1973 pxor @XMM[13], @XMM[7] 1974 movdqu @XMM[3], 0x40($out) 1975 pxor @XMM[14], @XMM[2] 1976 movdqu @XMM[7], 0x50($out) 1977 pxor @XMM[15], @XMM[5] 1978 movdqu @XMM[2], 0x60($out) 1979 lea .LADD1(%rip), %r11 1980 movdqu @XMM[5], 0x70($out) 1981 lea 0x80($out), $out 1982 paddd 0x70(%r11), @XMM[0] # .LADD8 1983 jnz .Lctr_enc_loop 1984 1985 jmp .Lctr_enc_done 1986 .align 16 1987 .Lctr_enc_loop_done: 1988 add \$8, $len 1989 movdqu 0x00($inp), @XMM[8] # load input 1990 pxor @XMM[8], @XMM[0] 1991 movdqu @XMM[0], 0x00($out) # write output 1992 cmp \$2,$len 1993 jb .Lctr_enc_done 1994 movdqu 0x10($inp), @XMM[9] 1995 pxor @XMM[9], @XMM[1] 1996 movdqu @XMM[1], 0x10($out) 1997 je .Lctr_enc_done 1998 movdqu 0x20($inp), @XMM[10] 1999 pxor @XMM[10], @XMM[4] 2000 movdqu @XMM[4], 0x20($out) 2001 cmp \$4,$len 2002 jb .Lctr_enc_done 2003 movdqu 0x30($inp), @XMM[11] 2004 pxor @XMM[11], @XMM[6] 2005 movdqu @XMM[6], 0x30($out) 2006 je .Lctr_enc_done 2007 movdqu 0x40($inp), @XMM[12] 2008 pxor @XMM[12], @XMM[3] 2009 movdqu @XMM[3], 0x40($out) 2010 cmp \$6,$len 2011 jb .Lctr_enc_done 2012 movdqu 0x50($inp), @XMM[13] 2013 pxor @XMM[13], @XMM[7] 2014 movdqu @XMM[7], 0x50($out) 2015 je .Lctr_enc_done 2016 movdqu 0x60($inp), @XMM[14] 2017 pxor @XMM[14], @XMM[2] 2018 movdqu @XMM[2], 0x60($out) 2019 jmp .Lctr_enc_done 2020 2021 .align 16 2022 .Lctr_enc_short: 2023 lea 0x20(%rbp), $arg1 2024 lea 0x30(%rbp), $arg2 2025 lea ($key), $arg3 2026 call asm_AES_encrypt 2027 movdqu ($inp), @XMM[1] 2028 lea 16($inp), $inp 2029 mov 0x2c(%rbp), %eax # load 32-bit counter 2030 bswap %eax 2031 pxor 0x30(%rbp), @XMM[1] 2032 inc %eax # increment 2033 movdqu @XMM[1], ($out) 2034 bswap %eax 2035 lea 16($out), $out 2036 mov %eax, 0x2c(%rsp) # save 32-bit counter 2037 dec $len 2038 jnz .Lctr_enc_short 2039 2040 .Lctr_enc_done: 2041 lea (%rsp), %rax 2042 pxor %xmm0, %xmm0 2043 .Lctr_enc_bzero: # wipe key schedule [if any] 2044 movdqa %xmm0, 0x00(%rax) 2045 movdqa %xmm0, 0x10(%rax) 2046 lea 0x20(%rax), %rax 2047 cmp %rax, %rbp 2048 ja .Lctr_enc_bzero 2049 2050 lea (%rbp),%rsp # restore %rsp 2051 ___ 2052 $code.=<<___ if ($win64); 2053 movaps 0x40(%rbp), %xmm6 2054 movaps 0x50(%rbp), %xmm7 2055 movaps 0x60(%rbp), %xmm8 2056 movaps 0x70(%rbp), %xmm9 2057 movaps 0x80(%rbp), %xmm10 2058 movaps 0x90(%rbp), %xmm11 2059 movaps 0xa0(%rbp), %xmm12 2060 movaps 0xb0(%rbp), %xmm13 2061 movaps 0xc0(%rbp), %xmm14 2062 movaps 0xd0(%rbp), %xmm15 2063 lea 0xa0(%rbp), %rsp 2064 ___ 2065 $code.=<<___; 2066 mov 0x48(%rsp), %r15 2067 mov 0x50(%rsp), %r14 2068 mov 0x58(%rsp), %r13 2069 mov 0x60(%rsp), %r12 2070 mov 0x68(%rsp), %rbx 2071 mov 0x70(%rsp), %rax 2072 lea 0x78(%rsp), %rsp 2073 mov %rax, %rbp 2074 .Lctr_enc_epilogue: 2075 ret 2076 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2077 ___ 2078 ###################################################################### 2079 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2080 # const AES_KEY *key1, const AES_KEY *key2, 2081 # const unsigned char iv[16]); 2082 # 2083 my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2084 $arg6=~s/d$//; 2085 2086 $code.=<<___; 2087 .globl bsaes_xts_encrypt 2088 .type bsaes_xts_encrypt,\@abi-omnipotent 2089 .align 16 2090 bsaes_xts_encrypt: 2091 mov %rsp, %rax 2092 .Lxts_enc_prologue: 2093 push %rbp 2094 push %rbx 2095 push %r12 2096 push %r13 2097 push %r14 2098 push %r15 2099 lea -0x48(%rsp), %rsp 2100 ___ 2101 $code.=<<___ if ($win64); 2102 mov 0xa0(%rsp),$arg5 # pull key2 2103 mov 0xa8(%rsp),$arg6 # pull ivp 2104 lea -0xa0(%rsp), %rsp 2105 movaps %xmm6, 0x40(%rsp) 2106 movaps %xmm7, 0x50(%rsp) 2107 movaps %xmm8, 0x60(%rsp) 2108 movaps %xmm9, 0x70(%rsp) 2109 movaps %xmm10, 0x80(%rsp) 2110 movaps %xmm11, 0x90(%rsp) 2111 movaps %xmm12, 0xa0(%rsp) 2112 movaps %xmm13, 0xb0(%rsp) 2113 movaps %xmm14, 0xc0(%rsp) 2114 movaps %xmm15, 0xd0(%rsp) 2115 .Lxts_enc_body: 2116 ___ 2117 $code.=<<___; 2118 mov %rsp, %rbp # backup %rsp 2119 mov $arg1, $inp # backup arguments 2120 mov $arg2, $out 2121 mov $arg3, $len 2122 mov $arg4, $key 2123 2124 lea ($arg6), $arg1 2125 lea 0x20(%rbp), $arg2 2126 lea ($arg5), $arg3 2127 call asm_AES_encrypt # generate initial tweak 2128 2129 mov 240($key), %eax # rounds 2130 mov $len, %rbx # backup $len 2131 2132 mov %eax, %edx # rounds 2133 shl \$7, %rax # 128 bytes per inner round key 2134 sub \$`128-32`, %rax # size of bit-sliced key schedule 2135 sub %rax, %rsp 2136 2137 mov %rsp, %rax # pass key schedule 2138 mov $key, %rcx # pass key 2139 mov %edx, %r10d # pass rounds 2140 call _bsaes_key_convert 2141 pxor %xmm6, %xmm7 # fix up last round key 2142 movdqa %xmm7, (%rax) # save last round key 2143 2144 and \$-16, $len 2145 sub \$0x80, %rsp # place for tweak[8] 2146 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2147 2148 pxor $twtmp, $twtmp 2149 movdqa .Lxts_magic(%rip), $twmask 2150 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2151 2152 sub \$0x80, $len 2153 jc .Lxts_enc_short 2154 jmp .Lxts_enc_loop 2155 2156 .align 16 2157 .Lxts_enc_loop: 2158 ___ 2159 for ($i=0;$i<7;$i++) { 2160 $code.=<<___; 2161 pshufd \$0x13, $twtmp, $twres 2162 pxor $twtmp, $twtmp 2163 movdqa @XMM[7], @XMM[$i] 2164 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2165 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2166 pand $twmask, $twres # isolate carry and residue 2167 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2168 pxor $twres, @XMM[7] 2169 ___ 2170 $code.=<<___ if ($i>=1); 2171 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2172 ___ 2173 $code.=<<___ if ($i>=2); 2174 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2175 ___ 2176 } 2177 $code.=<<___; 2178 movdqu 0x60($inp), @XMM[8+6] 2179 pxor @XMM[8+5], @XMM[5] 2180 movdqu 0x70($inp), @XMM[8+7] 2181 lea 0x80($inp), $inp 2182 movdqa @XMM[7], 0x70(%rsp) 2183 pxor @XMM[8+6], @XMM[6] 2184 lea 0x80(%rsp), %rax # pass key schedule 2185 pxor @XMM[8+7], @XMM[7] 2186 mov %edx, %r10d # pass rounds 2187 2188 call _bsaes_encrypt8 2189 2190 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2191 pxor 0x10(%rsp), @XMM[1] 2192 movdqu @XMM[0], 0x00($out) # write output 2193 pxor 0x20(%rsp), @XMM[4] 2194 movdqu @XMM[1], 0x10($out) 2195 pxor 0x30(%rsp), @XMM[6] 2196 movdqu @XMM[4], 0x20($out) 2197 pxor 0x40(%rsp), @XMM[3] 2198 movdqu @XMM[6], 0x30($out) 2199 pxor 0x50(%rsp), @XMM[7] 2200 movdqu @XMM[3], 0x40($out) 2201 pxor 0x60(%rsp), @XMM[2] 2202 movdqu @XMM[7], 0x50($out) 2203 pxor 0x70(%rsp), @XMM[5] 2204 movdqu @XMM[2], 0x60($out) 2205 movdqu @XMM[5], 0x70($out) 2206 lea 0x80($out), $out 2207 2208 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2209 pxor $twtmp, $twtmp 2210 movdqa .Lxts_magic(%rip), $twmask 2211 pcmpgtd @XMM[7], $twtmp 2212 pshufd \$0x13, $twtmp, $twres 2213 pxor $twtmp, $twtmp 2214 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2215 pand $twmask, $twres # isolate carry and residue 2216 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2217 pxor $twres, @XMM[7] 2218 2219 sub \$0x80,$len 2220 jnc .Lxts_enc_loop 2221 2222 .Lxts_enc_short: 2223 add \$0x80, $len 2224 jz .Lxts_enc_done 2225 ___ 2226 for ($i=0;$i<7;$i++) { 2227 $code.=<<___; 2228 pshufd \$0x13, $twtmp, $twres 2229 pxor $twtmp, $twtmp 2230 movdqa @XMM[7], @XMM[$i] 2231 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2232 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2233 pand $twmask, $twres # isolate carry and residue 2234 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2235 pxor $twres, @XMM[7] 2236 ___ 2237 $code.=<<___ if ($i>=1); 2238 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2239 cmp \$`0x10*$i`,$len 2240 je .Lxts_enc_$i 2241 ___ 2242 $code.=<<___ if ($i>=2); 2243 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2244 ___ 2245 } 2246 $code.=<<___; 2247 movdqu 0x60($inp), @XMM[8+6] 2248 pxor @XMM[8+5], @XMM[5] 2249 movdqa @XMM[7], 0x70(%rsp) 2250 lea 0x70($inp), $inp 2251 pxor @XMM[8+6], @XMM[6] 2252 lea 0x80(%rsp), %rax # pass key schedule 2253 mov %edx, %r10d # pass rounds 2254 2255 call _bsaes_encrypt8 2256 2257 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2258 pxor 0x10(%rsp), @XMM[1] 2259 movdqu @XMM[0], 0x00($out) # write output 2260 pxor 0x20(%rsp), @XMM[4] 2261 movdqu @XMM[1], 0x10($out) 2262 pxor 0x30(%rsp), @XMM[6] 2263 movdqu @XMM[4], 0x20($out) 2264 pxor 0x40(%rsp), @XMM[3] 2265 movdqu @XMM[6], 0x30($out) 2266 pxor 0x50(%rsp), @XMM[7] 2267 movdqu @XMM[3], 0x40($out) 2268 pxor 0x60(%rsp), @XMM[2] 2269 movdqu @XMM[7], 0x50($out) 2270 movdqu @XMM[2], 0x60($out) 2271 lea 0x70($out), $out 2272 2273 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2274 jmp .Lxts_enc_done 2275 .align 16 2276 .Lxts_enc_6: 2277 pxor @XMM[8+4], @XMM[4] 2278 lea 0x60($inp), $inp 2279 pxor @XMM[8+5], @XMM[5] 2280 lea 0x80(%rsp), %rax # pass key schedule 2281 mov %edx, %r10d # pass rounds 2282 2283 call _bsaes_encrypt8 2284 2285 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2286 pxor 0x10(%rsp), @XMM[1] 2287 movdqu @XMM[0], 0x00($out) # write output 2288 pxor 0x20(%rsp), @XMM[4] 2289 movdqu @XMM[1], 0x10($out) 2290 pxor 0x30(%rsp), @XMM[6] 2291 movdqu @XMM[4], 0x20($out) 2292 pxor 0x40(%rsp), @XMM[3] 2293 movdqu @XMM[6], 0x30($out) 2294 pxor 0x50(%rsp), @XMM[7] 2295 movdqu @XMM[3], 0x40($out) 2296 movdqu @XMM[7], 0x50($out) 2297 lea 0x60($out), $out 2298 2299 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2300 jmp .Lxts_enc_done 2301 .align 16 2302 .Lxts_enc_5: 2303 pxor @XMM[8+3], @XMM[3] 2304 lea 0x50($inp), $inp 2305 pxor @XMM[8+4], @XMM[4] 2306 lea 0x80(%rsp), %rax # pass key schedule 2307 mov %edx, %r10d # pass rounds 2308 2309 call _bsaes_encrypt8 2310 2311 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2312 pxor 0x10(%rsp), @XMM[1] 2313 movdqu @XMM[0], 0x00($out) # write output 2314 pxor 0x20(%rsp), @XMM[4] 2315 movdqu @XMM[1], 0x10($out) 2316 pxor 0x30(%rsp), @XMM[6] 2317 movdqu @XMM[4], 0x20($out) 2318 pxor 0x40(%rsp), @XMM[3] 2319 movdqu @XMM[6], 0x30($out) 2320 movdqu @XMM[3], 0x40($out) 2321 lea 0x50($out), $out 2322 2323 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2324 jmp .Lxts_enc_done 2325 .align 16 2326 .Lxts_enc_4: 2327 pxor @XMM[8+2], @XMM[2] 2328 lea 0x40($inp), $inp 2329 pxor @XMM[8+3], @XMM[3] 2330 lea 0x80(%rsp), %rax # pass key schedule 2331 mov %edx, %r10d # pass rounds 2332 2333 call _bsaes_encrypt8 2334 2335 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2336 pxor 0x10(%rsp), @XMM[1] 2337 movdqu @XMM[0], 0x00($out) # write output 2338 pxor 0x20(%rsp), @XMM[4] 2339 movdqu @XMM[1], 0x10($out) 2340 pxor 0x30(%rsp), @XMM[6] 2341 movdqu @XMM[4], 0x20($out) 2342 movdqu @XMM[6], 0x30($out) 2343 lea 0x40($out), $out 2344 2345 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2346 jmp .Lxts_enc_done 2347 .align 16 2348 .Lxts_enc_3: 2349 pxor @XMM[8+1], @XMM[1] 2350 lea 0x30($inp), $inp 2351 pxor @XMM[8+2], @XMM[2] 2352 lea 0x80(%rsp), %rax # pass key schedule 2353 mov %edx, %r10d # pass rounds 2354 2355 call _bsaes_encrypt8 2356 2357 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2358 pxor 0x10(%rsp), @XMM[1] 2359 movdqu @XMM[0], 0x00($out) # write output 2360 pxor 0x20(%rsp), @XMM[4] 2361 movdqu @XMM[1], 0x10($out) 2362 movdqu @XMM[4], 0x20($out) 2363 lea 0x30($out), $out 2364 2365 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2366 jmp .Lxts_enc_done 2367 .align 16 2368 .Lxts_enc_2: 2369 pxor @XMM[8+0], @XMM[0] 2370 lea 0x20($inp), $inp 2371 pxor @XMM[8+1], @XMM[1] 2372 lea 0x80(%rsp), %rax # pass key schedule 2373 mov %edx, %r10d # pass rounds 2374 2375 call _bsaes_encrypt8 2376 2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2378 pxor 0x10(%rsp), @XMM[1] 2379 movdqu @XMM[0], 0x00($out) # write output 2380 movdqu @XMM[1], 0x10($out) 2381 lea 0x20($out), $out 2382 2383 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2384 jmp .Lxts_enc_done 2385 .align 16 2386 .Lxts_enc_1: 2387 pxor @XMM[0], @XMM[8] 2388 lea 0x10($inp), $inp 2389 movdqa @XMM[8], 0x20(%rbp) 2390 lea 0x20(%rbp), $arg1 2391 lea 0x20(%rbp), $arg2 2392 lea ($key), $arg3 2393 call asm_AES_encrypt # doesn't touch %xmm 2394 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2395 #pxor @XMM[8], @XMM[0] 2396 #lea 0x80(%rsp), %rax # pass key schedule 2397 #mov %edx, %r10d # pass rounds 2398 #call _bsaes_encrypt8 2399 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2400 movdqu @XMM[0], 0x00($out) # write output 2401 lea 0x10($out), $out 2402 2403 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2404 2405 .Lxts_enc_done: 2406 and \$15, %ebx 2407 jz .Lxts_enc_ret 2408 mov $out, %rdx 2409 2410 .Lxts_enc_steal: 2411 movzb ($inp), %eax 2412 movzb -16(%rdx), %ecx 2413 lea 1($inp), $inp 2414 mov %al, -16(%rdx) 2415 mov %cl, 0(%rdx) 2416 lea 1(%rdx), %rdx 2417 sub \$1,%ebx 2418 jnz .Lxts_enc_steal 2419 2420 movdqu -16($out), @XMM[0] 2421 lea 0x20(%rbp), $arg1 2422 pxor @XMM[7], @XMM[0] 2423 lea 0x20(%rbp), $arg2 2424 movdqa @XMM[0], 0x20(%rbp) 2425 lea ($key), $arg3 2426 call asm_AES_encrypt # doesn't touch %xmm 2427 pxor 0x20(%rbp), @XMM[7] 2428 movdqu @XMM[7], -16($out) 2429 2430 .Lxts_enc_ret: 2431 lea (%rsp), %rax 2432 pxor %xmm0, %xmm0 2433 .Lxts_enc_bzero: # wipe key schedule [if any] 2434 movdqa %xmm0, 0x00(%rax) 2435 movdqa %xmm0, 0x10(%rax) 2436 lea 0x20(%rax), %rax 2437 cmp %rax, %rbp 2438 ja .Lxts_enc_bzero 2439 2440 lea (%rbp),%rsp # restore %rsp 2441 ___ 2442 $code.=<<___ if ($win64); 2443 movaps 0x40(%rbp), %xmm6 2444 movaps 0x50(%rbp), %xmm7 2445 movaps 0x60(%rbp), %xmm8 2446 movaps 0x70(%rbp), %xmm9 2447 movaps 0x80(%rbp), %xmm10 2448 movaps 0x90(%rbp), %xmm11 2449 movaps 0xa0(%rbp), %xmm12 2450 movaps 0xb0(%rbp), %xmm13 2451 movaps 0xc0(%rbp), %xmm14 2452 movaps 0xd0(%rbp), %xmm15 2453 lea 0xa0(%rbp), %rsp 2454 ___ 2455 $code.=<<___; 2456 mov 0x48(%rsp), %r15 2457 mov 0x50(%rsp), %r14 2458 mov 0x58(%rsp), %r13 2459 mov 0x60(%rsp), %r12 2460 mov 0x68(%rsp), %rbx 2461 mov 0x70(%rsp), %rax 2462 lea 0x78(%rsp), %rsp 2463 mov %rax, %rbp 2464 .Lxts_enc_epilogue: 2465 ret 2466 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2467 2468 .globl bsaes_xts_decrypt 2469 .type bsaes_xts_decrypt,\@abi-omnipotent 2470 .align 16 2471 bsaes_xts_decrypt: 2472 mov %rsp, %rax 2473 .Lxts_dec_prologue: 2474 push %rbp 2475 push %rbx 2476 push %r12 2477 push %r13 2478 push %r14 2479 push %r15 2480 lea -0x48(%rsp), %rsp 2481 ___ 2482 $code.=<<___ if ($win64); 2483 mov 0xa0(%rsp),$arg5 # pull key2 2484 mov 0xa8(%rsp),$arg6 # pull ivp 2485 lea -0xa0(%rsp), %rsp 2486 movaps %xmm6, 0x40(%rsp) 2487 movaps %xmm7, 0x50(%rsp) 2488 movaps %xmm8, 0x60(%rsp) 2489 movaps %xmm9, 0x70(%rsp) 2490 movaps %xmm10, 0x80(%rsp) 2491 movaps %xmm11, 0x90(%rsp) 2492 movaps %xmm12, 0xa0(%rsp) 2493 movaps %xmm13, 0xb0(%rsp) 2494 movaps %xmm14, 0xc0(%rsp) 2495 movaps %xmm15, 0xd0(%rsp) 2496 .Lxts_dec_body: 2497 ___ 2498 $code.=<<___; 2499 mov %rsp, %rbp # backup %rsp 2500 mov $arg1, $inp # backup arguments 2501 mov $arg2, $out 2502 mov $arg3, $len 2503 mov $arg4, $key 2504 2505 lea ($arg6), $arg1 2506 lea 0x20(%rbp), $arg2 2507 lea ($arg5), $arg3 2508 call asm_AES_encrypt # generate initial tweak 2509 2510 mov 240($key), %eax # rounds 2511 mov $len, %rbx # backup $len 2512 2513 mov %eax, %edx # rounds 2514 shl \$7, %rax # 128 bytes per inner round key 2515 sub \$`128-32`, %rax # size of bit-sliced key schedule 2516 sub %rax, %rsp 2517 2518 mov %rsp, %rax # pass key schedule 2519 mov $key, %rcx # pass key 2520 mov %edx, %r10d # pass rounds 2521 call _bsaes_key_convert 2522 pxor (%rsp), %xmm7 # fix up round 0 key 2523 movdqa %xmm6, (%rax) # save last round key 2524 movdqa %xmm7, (%rsp) 2525 2526 xor %eax, %eax # if ($len%16) len-=16; 2527 and \$-16, $len 2528 test \$15, %ebx 2529 setnz %al 2530 shl \$4, %rax 2531 sub %rax, $len 2532 2533 sub \$0x80, %rsp # place for tweak[8] 2534 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2535 2536 pxor $twtmp, $twtmp 2537 movdqa .Lxts_magic(%rip), $twmask 2538 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2539 2540 sub \$0x80, $len 2541 jc .Lxts_dec_short 2542 jmp .Lxts_dec_loop 2543 2544 .align 16 2545 .Lxts_dec_loop: 2546 ___ 2547 for ($i=0;$i<7;$i++) { 2548 $code.=<<___; 2549 pshufd \$0x13, $twtmp, $twres 2550 pxor $twtmp, $twtmp 2551 movdqa @XMM[7], @XMM[$i] 2552 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2553 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2554 pand $twmask, $twres # isolate carry and residue 2555 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2556 pxor $twres, @XMM[7] 2557 ___ 2558 $code.=<<___ if ($i>=1); 2559 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2560 ___ 2561 $code.=<<___ if ($i>=2); 2562 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2563 ___ 2564 } 2565 $code.=<<___; 2566 movdqu 0x60($inp), @XMM[8+6] 2567 pxor @XMM[8+5], @XMM[5] 2568 movdqu 0x70($inp), @XMM[8+7] 2569 lea 0x80($inp), $inp 2570 movdqa @XMM[7], 0x70(%rsp) 2571 pxor @XMM[8+6], @XMM[6] 2572 lea 0x80(%rsp), %rax # pass key schedule 2573 pxor @XMM[8+7], @XMM[7] 2574 mov %edx, %r10d # pass rounds 2575 2576 call _bsaes_decrypt8 2577 2578 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2579 pxor 0x10(%rsp), @XMM[1] 2580 movdqu @XMM[0], 0x00($out) # write output 2581 pxor 0x20(%rsp), @XMM[6] 2582 movdqu @XMM[1], 0x10($out) 2583 pxor 0x30(%rsp), @XMM[4] 2584 movdqu @XMM[6], 0x20($out) 2585 pxor 0x40(%rsp), @XMM[2] 2586 movdqu @XMM[4], 0x30($out) 2587 pxor 0x50(%rsp), @XMM[7] 2588 movdqu @XMM[2], 0x40($out) 2589 pxor 0x60(%rsp), @XMM[3] 2590 movdqu @XMM[7], 0x50($out) 2591 pxor 0x70(%rsp), @XMM[5] 2592 movdqu @XMM[3], 0x60($out) 2593 movdqu @XMM[5], 0x70($out) 2594 lea 0x80($out), $out 2595 2596 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2597 pxor $twtmp, $twtmp 2598 movdqa .Lxts_magic(%rip), $twmask 2599 pcmpgtd @XMM[7], $twtmp 2600 pshufd \$0x13, $twtmp, $twres 2601 pxor $twtmp, $twtmp 2602 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2603 pand $twmask, $twres # isolate carry and residue 2604 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2605 pxor $twres, @XMM[7] 2606 2607 sub \$0x80,$len 2608 jnc .Lxts_dec_loop 2609 2610 .Lxts_dec_short: 2611 add \$0x80, $len 2612 jz .Lxts_dec_done 2613 ___ 2614 for ($i=0;$i<7;$i++) { 2615 $code.=<<___; 2616 pshufd \$0x13, $twtmp, $twres 2617 pxor $twtmp, $twtmp 2618 movdqa @XMM[7], @XMM[$i] 2619 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2620 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2621 pand $twmask, $twres # isolate carry and residue 2622 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2623 pxor $twres, @XMM[7] 2624 ___ 2625 $code.=<<___ if ($i>=1); 2626 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2627 cmp \$`0x10*$i`,$len 2628 je .Lxts_dec_$i 2629 ___ 2630 $code.=<<___ if ($i>=2); 2631 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2632 ___ 2633 } 2634 $code.=<<___; 2635 movdqu 0x60($inp), @XMM[8+6] 2636 pxor @XMM[8+5], @XMM[5] 2637 movdqa @XMM[7], 0x70(%rsp) 2638 lea 0x70($inp), $inp 2639 pxor @XMM[8+6], @XMM[6] 2640 lea 0x80(%rsp), %rax # pass key schedule 2641 mov %edx, %r10d # pass rounds 2642 2643 call _bsaes_decrypt8 2644 2645 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2646 pxor 0x10(%rsp), @XMM[1] 2647 movdqu @XMM[0], 0x00($out) # write output 2648 pxor 0x20(%rsp), @XMM[6] 2649 movdqu @XMM[1], 0x10($out) 2650 pxor 0x30(%rsp), @XMM[4] 2651 movdqu @XMM[6], 0x20($out) 2652 pxor 0x40(%rsp), @XMM[2] 2653 movdqu @XMM[4], 0x30($out) 2654 pxor 0x50(%rsp), @XMM[7] 2655 movdqu @XMM[2], 0x40($out) 2656 pxor 0x60(%rsp), @XMM[3] 2657 movdqu @XMM[7], 0x50($out) 2658 movdqu @XMM[3], 0x60($out) 2659 lea 0x70($out), $out 2660 2661 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2662 jmp .Lxts_dec_done 2663 .align 16 2664 .Lxts_dec_6: 2665 pxor @XMM[8+4], @XMM[4] 2666 lea 0x60($inp), $inp 2667 pxor @XMM[8+5], @XMM[5] 2668 lea 0x80(%rsp), %rax # pass key schedule 2669 mov %edx, %r10d # pass rounds 2670 2671 call _bsaes_decrypt8 2672 2673 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2674 pxor 0x10(%rsp), @XMM[1] 2675 movdqu @XMM[0], 0x00($out) # write output 2676 pxor 0x20(%rsp), @XMM[6] 2677 movdqu @XMM[1], 0x10($out) 2678 pxor 0x30(%rsp), @XMM[4] 2679 movdqu @XMM[6], 0x20($out) 2680 pxor 0x40(%rsp), @XMM[2] 2681 movdqu @XMM[4], 0x30($out) 2682 pxor 0x50(%rsp), @XMM[7] 2683 movdqu @XMM[2], 0x40($out) 2684 movdqu @XMM[7], 0x50($out) 2685 lea 0x60($out), $out 2686 2687 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2688 jmp .Lxts_dec_done 2689 .align 16 2690 .Lxts_dec_5: 2691 pxor @XMM[8+3], @XMM[3] 2692 lea 0x50($inp), $inp 2693 pxor @XMM[8+4], @XMM[4] 2694 lea 0x80(%rsp), %rax # pass key schedule 2695 mov %edx, %r10d # pass rounds 2696 2697 call _bsaes_decrypt8 2698 2699 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2700 pxor 0x10(%rsp), @XMM[1] 2701 movdqu @XMM[0], 0x00($out) # write output 2702 pxor 0x20(%rsp), @XMM[6] 2703 movdqu @XMM[1], 0x10($out) 2704 pxor 0x30(%rsp), @XMM[4] 2705 movdqu @XMM[6], 0x20($out) 2706 pxor 0x40(%rsp), @XMM[2] 2707 movdqu @XMM[4], 0x30($out) 2708 movdqu @XMM[2], 0x40($out) 2709 lea 0x50($out), $out 2710 2711 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2712 jmp .Lxts_dec_done 2713 .align 16 2714 .Lxts_dec_4: 2715 pxor @XMM[8+2], @XMM[2] 2716 lea 0x40($inp), $inp 2717 pxor @XMM[8+3], @XMM[3] 2718 lea 0x80(%rsp), %rax # pass key schedule 2719 mov %edx, %r10d # pass rounds 2720 2721 call _bsaes_decrypt8 2722 2723 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2724 pxor 0x10(%rsp), @XMM[1] 2725 movdqu @XMM[0], 0x00($out) # write output 2726 pxor 0x20(%rsp), @XMM[6] 2727 movdqu @XMM[1], 0x10($out) 2728 pxor 0x30(%rsp), @XMM[4] 2729 movdqu @XMM[6], 0x20($out) 2730 movdqu @XMM[4], 0x30($out) 2731 lea 0x40($out), $out 2732 2733 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2734 jmp .Lxts_dec_done 2735 .align 16 2736 .Lxts_dec_3: 2737 pxor @XMM[8+1], @XMM[1] 2738 lea 0x30($inp), $inp 2739 pxor @XMM[8+2], @XMM[2] 2740 lea 0x80(%rsp), %rax # pass key schedule 2741 mov %edx, %r10d # pass rounds 2742 2743 call _bsaes_decrypt8 2744 2745 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2746 pxor 0x10(%rsp), @XMM[1] 2747 movdqu @XMM[0], 0x00($out) # write output 2748 pxor 0x20(%rsp), @XMM[6] 2749 movdqu @XMM[1], 0x10($out) 2750 movdqu @XMM[6], 0x20($out) 2751 lea 0x30($out), $out 2752 2753 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2754 jmp .Lxts_dec_done 2755 .align 16 2756 .Lxts_dec_2: 2757 pxor @XMM[8+0], @XMM[0] 2758 lea 0x20($inp), $inp 2759 pxor @XMM[8+1], @XMM[1] 2760 lea 0x80(%rsp), %rax # pass key schedule 2761 mov %edx, %r10d # pass rounds 2762 2763 call _bsaes_decrypt8 2764 2765 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2766 pxor 0x10(%rsp), @XMM[1] 2767 movdqu @XMM[0], 0x00($out) # write output 2768 movdqu @XMM[1], 0x10($out) 2769 lea 0x20($out), $out 2770 2771 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2772 jmp .Lxts_dec_done 2773 .align 16 2774 .Lxts_dec_1: 2775 pxor @XMM[0], @XMM[8] 2776 lea 0x10($inp), $inp 2777 movdqa @XMM[8], 0x20(%rbp) 2778 lea 0x20(%rbp), $arg1 2779 lea 0x20(%rbp), $arg2 2780 lea ($key), $arg3 2781 call asm_AES_decrypt # doesn't touch %xmm 2782 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2783 #pxor @XMM[8], @XMM[0] 2784 #lea 0x80(%rsp), %rax # pass key schedule 2785 #mov %edx, %r10d # pass rounds 2786 #call _bsaes_decrypt8 2787 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2788 movdqu @XMM[0], 0x00($out) # write output 2789 lea 0x10($out), $out 2790 2791 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2792 2793 .Lxts_dec_done: 2794 and \$15, %ebx 2795 jz .Lxts_dec_ret 2796 2797 pxor $twtmp, $twtmp 2798 movdqa .Lxts_magic(%rip), $twmask 2799 pcmpgtd @XMM[7], $twtmp 2800 pshufd \$0x13, $twtmp, $twres 2801 movdqa @XMM[7], @XMM[6] 2802 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2803 pand $twmask, $twres # isolate carry and residue 2804 movdqu ($inp), @XMM[0] 2805 pxor $twres, @XMM[7] 2806 2807 lea 0x20(%rbp), $arg1 2808 pxor @XMM[7], @XMM[0] 2809 lea 0x20(%rbp), $arg2 2810 movdqa @XMM[0], 0x20(%rbp) 2811 lea ($key), $arg3 2812 call asm_AES_decrypt # doesn't touch %xmm 2813 pxor 0x20(%rbp), @XMM[7] 2814 mov $out, %rdx 2815 movdqu @XMM[7], ($out) 2816 2817 .Lxts_dec_steal: 2818 movzb 16($inp), %eax 2819 movzb (%rdx), %ecx 2820 lea 1($inp), $inp 2821 mov %al, (%rdx) 2822 mov %cl, 16(%rdx) 2823 lea 1(%rdx), %rdx 2824 sub \$1,%ebx 2825 jnz .Lxts_dec_steal 2826 2827 movdqu ($out), @XMM[0] 2828 lea 0x20(%rbp), $arg1 2829 pxor @XMM[6], @XMM[0] 2830 lea 0x20(%rbp), $arg2 2831 movdqa @XMM[0], 0x20(%rbp) 2832 lea ($key), $arg3 2833 call asm_AES_decrypt # doesn't touch %xmm 2834 pxor 0x20(%rbp), @XMM[6] 2835 movdqu @XMM[6], ($out) 2836 2837 .Lxts_dec_ret: 2838 lea (%rsp), %rax 2839 pxor %xmm0, %xmm0 2840 .Lxts_dec_bzero: # wipe key schedule [if any] 2841 movdqa %xmm0, 0x00(%rax) 2842 movdqa %xmm0, 0x10(%rax) 2843 lea 0x20(%rax), %rax 2844 cmp %rax, %rbp 2845 ja .Lxts_dec_bzero 2846 2847 lea (%rbp),%rsp # restore %rsp 2848 ___ 2849 $code.=<<___ if ($win64); 2850 movaps 0x40(%rbp), %xmm6 2851 movaps 0x50(%rbp), %xmm7 2852 movaps 0x60(%rbp), %xmm8 2853 movaps 0x70(%rbp), %xmm9 2854 movaps 0x80(%rbp), %xmm10 2855 movaps 0x90(%rbp), %xmm11 2856 movaps 0xa0(%rbp), %xmm12 2857 movaps 0xb0(%rbp), %xmm13 2858 movaps 0xc0(%rbp), %xmm14 2859 movaps 0xd0(%rbp), %xmm15 2860 lea 0xa0(%rbp), %rsp 2861 ___ 2862 $code.=<<___; 2863 mov 0x48(%rsp), %r15 2864 mov 0x50(%rsp), %r14 2865 mov 0x58(%rsp), %r13 2866 mov 0x60(%rsp), %r12 2867 mov 0x68(%rsp), %rbx 2868 mov 0x70(%rsp), %rax 2869 lea 0x78(%rsp), %rsp 2870 mov %rax, %rbp 2871 .Lxts_dec_epilogue: 2872 ret 2873 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2874 ___ 2875 } 2876 $code.=<<___; 2877 .type _bsaes_const,\@object 2878 .align 64 2879 _bsaes_const: 2880 .LM0ISR: # InvShiftRows constants 2881 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2882 .LISRM0: 2883 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2884 .LISR: 2885 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 2886 .LBS0: # bit-slice constants 2887 .quad 0x5555555555555555, 0x5555555555555555 2888 .LBS1: 2889 .quad 0x3333333333333333, 0x3333333333333333 2890 .LBS2: 2891 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2892 .LSR: # shiftrows constants 2893 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2894 .LSRM0: 2895 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2896 .LM0SR: 2897 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2898 .LSWPUP: # byte-swap upper dword 2899 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2900 .LSWPUPM0SR: 2901 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2902 .LADD1: # counter increment constants 2903 .quad 0x0000000000000000, 0x0000000100000000 2904 .LADD2: 2905 .quad 0x0000000000000000, 0x0000000200000000 2906 .LADD3: 2907 .quad 0x0000000000000000, 0x0000000300000000 2908 .LADD4: 2909 .quad 0x0000000000000000, 0x0000000400000000 2910 .LADD5: 2911 .quad 0x0000000000000000, 0x0000000500000000 2912 .LADD6: 2913 .quad 0x0000000000000000, 0x0000000600000000 2914 .LADD7: 2915 .quad 0x0000000000000000, 0x0000000700000000 2916 .LADD8: 2917 .quad 0x0000000000000000, 0x0000000800000000 2918 .Lxts_magic: 2919 .long 0x87,0,1,0 2920 .Lmasks: 2921 .quad 0x0101010101010101, 0x0101010101010101 2922 .quad 0x0202020202020202, 0x0202020202020202 2923 .quad 0x0404040404040404, 0x0404040404040404 2924 .quad 0x0808080808080808, 0x0808080808080808 2925 .LM0: 2926 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2927 .L63: 2928 .quad 0x6363636363636363, 0x6363636363636363 2929 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Ksper, Peter Schwabe, Andy Polyakov" 2930 .align 64 2931 .size _bsaes_const,.-_bsaes_const 2932 ___ 2933 2934 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2935 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 2936 if ($win64) { 2937 $rec="%rcx"; 2938 $frame="%rdx"; 2939 $context="%r8"; 2940 $disp="%r9"; 2941 2942 $code.=<<___; 2943 .extern __imp_RtlVirtualUnwind 2944 .type se_handler,\@abi-omnipotent 2945 .align 16 2946 se_handler: 2947 push %rsi 2948 push %rdi 2949 push %rbx 2950 push %rbp 2951 push %r12 2952 push %r13 2953 push %r14 2954 push %r15 2955 pushfq 2956 sub \$64,%rsp 2957 2958 mov 120($context),%rax # pull context->Rax 2959 mov 248($context),%rbx # pull context->Rip 2960 2961 mov 8($disp),%rsi # disp->ImageBase 2962 mov 56($disp),%r11 # disp->HandlerData 2963 2964 mov 0(%r11),%r10d # HandlerData[0] 2965 lea (%rsi,%r10),%r10 # prologue label 2966 cmp %r10,%rbx # context->Rip<prologue label 2967 jb .Lin_prologue 2968 2969 mov 152($context),%rax # pull context->Rsp 2970 2971 mov 4(%r11),%r10d # HandlerData[1] 2972 lea (%rsi,%r10),%r10 # epilogue label 2973 cmp %r10,%rbx # context->Rip>=epilogue label 2974 jae .Lin_prologue 2975 2976 mov 160($context),%rax # pull context->Rbp 2977 2978 lea 0x40(%rax),%rsi # %xmm save area 2979 lea 512($context),%rdi # &context.Xmm6 2980 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2981 .long 0xa548f3fc # cld; rep movsq 2982 lea 0xa0(%rax),%rax # adjust stack pointer 2983 2984 mov 0x70(%rax),%rbp 2985 mov 0x68(%rax),%rbx 2986 mov 0x60(%rax),%r12 2987 mov 0x58(%rax),%r13 2988 mov 0x50(%rax),%r14 2989 mov 0x48(%rax),%r15 2990 lea 0x78(%rax),%rax # adjust stack pointer 2991 mov %rbx,144($context) # restore context->Rbx 2992 mov %rbp,160($context) # restore context->Rbp 2993 mov %r12,216($context) # restore context->R12 2994 mov %r13,224($context) # restore context->R13 2995 mov %r14,232($context) # restore context->R14 2996 mov %r15,240($context) # restore context->R15 2997 2998 .Lin_prologue: 2999 mov %rax,152($context) # restore context->Rsp 3000 3001 mov 40($disp),%rdi # disp->ContextRecord 3002 mov $context,%rsi # context 3003 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3004 .long 0xa548f3fc # cld; rep movsq 3005 3006 mov $disp,%rsi 3007 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3008 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3009 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3010 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3011 mov 40(%rsi),%r10 # disp->ContextRecord 3012 lea 56(%rsi),%r11 # &disp->HandlerData 3013 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3014 mov %r10,32(%rsp) # arg5 3015 mov %r11,40(%rsp) # arg6 3016 mov %r12,48(%rsp) # arg7 3017 mov %rcx,56(%rsp) # arg8, (NULL) 3018 call *__imp_RtlVirtualUnwind(%rip) 3019 3020 mov \$1,%eax # ExceptionContinueSearch 3021 add \$64,%rsp 3022 popfq 3023 pop %r15 3024 pop %r14 3025 pop %r13 3026 pop %r12 3027 pop %rbp 3028 pop %rbx 3029 pop %rdi 3030 pop %rsi 3031 ret 3032 .size se_handler,.-se_handler 3033 3034 .section .pdata 3035 .align 4 3036 ___ 3037 $code.=<<___ if ($ecb); 3038 .rva .Lecb_enc_prologue 3039 .rva .Lecb_enc_epilogue 3040 .rva .Lecb_enc_info 3041 3042 .rva .Lecb_dec_prologue 3043 .rva .Lecb_dec_epilogue 3044 .rva .Lecb_dec_info 3045 ___ 3046 $code.=<<___; 3047 .rva .Lcbc_dec_prologue 3048 .rva .Lcbc_dec_epilogue 3049 .rva .Lcbc_dec_info 3050 3051 .rva .Lctr_enc_prologue 3052 .rva .Lctr_enc_epilogue 3053 .rva .Lctr_enc_info 3054 3055 .rva .Lxts_enc_prologue 3056 .rva .Lxts_enc_epilogue 3057 .rva .Lxts_enc_info 3058 3059 .rva .Lxts_dec_prologue 3060 .rva .Lxts_dec_epilogue 3061 .rva .Lxts_dec_info 3062 3063 .section .xdata 3064 .align 8 3065 ___ 3066 $code.=<<___ if ($ecb); 3067 .Lecb_enc_info: 3068 .byte 9,0,0,0 3069 .rva se_handler 3070 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3071 .Lecb_dec_info: 3072 .byte 9,0,0,0 3073 .rva se_handler 3074 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3075 ___ 3076 $code.=<<___; 3077 .Lcbc_dec_info: 3078 .byte 9,0,0,0 3079 .rva se_handler 3080 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3081 .Lctr_enc_info: 3082 .byte 9,0,0,0 3083 .rva se_handler 3084 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3085 .Lxts_enc_info: 3086 .byte 9,0,0,0 3087 .rva se_handler 3088 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3089 .Lxts_dec_info: 3090 .byte 9,0,0,0 3091 .rva se_handler 3092 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3093 ___ 3094 } 3095 3096 $code =~ s/\`([^\`]*)\`/eval($1)/gem; 3097 3098 print $code; 3099 3100 close STDOUT; 3101