1 #!/usr/bin/env perl 2 3 ################################################################### 4 ### AES-128 [originally in CTR mode] ### 5 ### bitsliced implementation for Intel Core 2 processors ### 6 ### requires support of SSE extensions up to SSSE3 ### 7 ### Author: Emilia Ksper and Peter Schwabe ### 8 ### Date: 2009-03-19 ### 9 ### Public domain ### 10 ### ### 11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12 ### further information. ### 13 ################################################################### 14 # 15 # September 2011. 16 # 17 # Started as transliteration to "perlasm" the original code has 18 # undergone following changes: 19 # 20 # - code was made position-independent; 21 # - rounds were folded into a loop resulting in >5x size reduction 22 # from 12.5KB to 2.2KB; 23 # - above was possibile thanks to mixcolumns() modification that 24 # allowed to feed its output back to aesenc[last], this was 25 # achieved at cost of two additional inter-registers moves; 26 # - some instruction reordering and interleaving; 27 # - this module doesn't implement key setup subroutine, instead it 28 # relies on conversion of "conventional" key schedule as returned 29 # by AES_set_encrypt_key (see discussion below); 30 # - first and last round keys are treated differently, which allowed 31 # to skip one shiftrows(), reduce bit-sliced key schedule and 32 # speed-up conversion by 22%; 33 # - support for 192- and 256-bit keys was added; 34 # 35 # Resulting performance in CPU cycles spent to encrypt one byte out 36 # of 4096-byte buffer with 128-bit key is: 37 # 38 # Emilia's this(*) difference 39 # 40 # Core 2 9.30 8.69 +7% 41 # Nehalem(**) 7.63 6.88 +11% 42 # Atom 17.1 16.4 +4% 43 # Silvermont - 12.9 44 # 45 # (*) Comparison is not completely fair, because "this" is ECB, 46 # i.e. no extra processing such as counter values calculation 47 # and xor-ing input as in Emilia's CTR implementation is 48 # performed. However, the CTR calculations stand for not more 49 # than 1% of total time, so comparison is *rather* fair. 50 # 51 # (**) Results were collected on Westmere, which is considered to 52 # be equivalent to Nehalem for this code. 53 # 54 # As for key schedule conversion subroutine. Interface to OpenSSL 55 # relies on per-invocation on-the-fly conversion. This naturally 56 # has impact on performance, especially for short inputs. Conversion 57 # time in CPU cycles and its ratio to CPU cycles spent in 8x block 58 # function is: 59 # 60 # conversion conversion/8x block 61 # Core 2 240 0.22 62 # Nehalem 180 0.20 63 # Atom 430 0.20 64 # 65 # The ratio values mean that 128-byte blocks will be processed 66 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 67 # etc. Then keep in mind that input sizes not divisible by 128 are 68 # *effectively* slower, especially shortest ones, e.g. consecutive 69 # 144-byte blocks are processed 44% slower than one would expect, 70 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 71 # it's still faster than ["hyper-threading-safe" code path in] 72 # aes-x86_64.pl on all lengths above 64 bytes... 73 # 74 # October 2011. 75 # 76 # Add decryption procedure. Performance in CPU cycles spent to decrypt 77 # one byte out of 4096-byte buffer with 128-bit key is: 78 # 79 # Core 2 9.98 80 # Nehalem 7.80 81 # Atom 17.9 82 # Silvermont 14.0 83 # 84 # November 2011. 85 # 86 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 87 # suboptimal, but XTS is meant to be used with larger blocks... 88 # 89 # <appro (at] openssl.org> 90 91 $flavour = shift; 92 $output = shift; 93 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 94 95 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 96 97 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 98 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 99 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 100 die "can't locate x86_64-xlate.pl"; 101 102 open OUT,"| \"$^X\" $xlate $flavour $output"; 103 *STDOUT=*OUT; 104 105 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 106 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 107 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 108 109 { 110 my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 111 112 sub Sbox { 113 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 114 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 115 my @b=@_[0..7]; 116 my @t=@_[8..11]; 117 my @s=@_[12..15]; 118 &InBasisChange (@b); 119 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 120 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 121 } 122 123 sub InBasisChange { 124 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 125 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 126 my @b=@_[0..7]; 127 $code.=<<___; 128 pxor @b[6], @b[5] 129 pxor @b[1], @b[2] 130 pxor @b[0], @b[3] 131 pxor @b[2], @b[6] 132 pxor @b[0], @b[5] 133 134 pxor @b[3], @b[6] 135 pxor @b[7], @b[3] 136 pxor @b[5], @b[7] 137 pxor @b[4], @b[3] 138 pxor @b[5], @b[4] 139 pxor @b[1], @b[3] 140 141 pxor @b[7], @b[2] 142 pxor @b[5], @b[1] 143 ___ 144 } 145 146 sub OutBasisChange { 147 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 148 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 149 my @b=@_[0..7]; 150 $code.=<<___; 151 pxor @b[6], @b[0] 152 pxor @b[4], @b[1] 153 pxor @b[0], @b[2] 154 pxor @b[6], @b[4] 155 pxor @b[1], @b[6] 156 157 pxor @b[5], @b[1] 158 pxor @b[3], @b[5] 159 pxor @b[7], @b[3] 160 pxor @b[5], @b[7] 161 pxor @b[5], @b[2] 162 163 pxor @b[7], @b[4] 164 ___ 165 } 166 167 sub InvSbox { 168 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 169 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 170 my @b=@_[0..7]; 171 my @t=@_[8..11]; 172 my @s=@_[12..15]; 173 &InvInBasisChange (@b); 174 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 175 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 176 } 177 178 sub InvInBasisChange { # OutBasisChange in reverse 179 my @b=@_[5,1,2,6,3,7,0,4]; 180 $code.=<<___ 181 pxor @b[7], @b[4] 182 183 pxor @b[5], @b[7] 184 pxor @b[5], @b[2] 185 pxor @b[7], @b[3] 186 pxor @b[3], @b[5] 187 pxor @b[5], @b[1] 188 189 pxor @b[1], @b[6] 190 pxor @b[0], @b[2] 191 pxor @b[6], @b[4] 192 pxor @b[6], @b[0] 193 pxor @b[4], @b[1] 194 ___ 195 } 196 197 sub InvOutBasisChange { # InBasisChange in reverse 198 my @b=@_[2,5,7,3,6,1,0,4]; 199 $code.=<<___; 200 pxor @b[5], @b[1] 201 pxor @b[7], @b[2] 202 203 pxor @b[1], @b[3] 204 pxor @b[5], @b[4] 205 pxor @b[5], @b[7] 206 pxor @b[4], @b[3] 207 pxor @b[0], @b[5] 208 pxor @b[7], @b[3] 209 pxor @b[2], @b[6] 210 pxor @b[1], @b[2] 211 pxor @b[3], @b[6] 212 213 pxor @b[0], @b[3] 214 pxor @b[6], @b[5] 215 ___ 216 } 217 218 sub Mul_GF4 { 219 #;************************************************************* 220 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 221 #;************************************************************* 222 my ($x0,$x1,$y0,$y1,$t0)=@_; 223 $code.=<<___; 224 movdqa $y0, $t0 225 pxor $y1, $t0 226 pand $x0, $t0 227 pxor $x1, $x0 228 pand $y0, $x1 229 pand $y1, $x0 230 pxor $x1, $x0 231 pxor $t0, $x1 232 ___ 233 } 234 235 sub Mul_GF4_N { # not used, see next subroutine 236 # multiply and scale by N 237 my ($x0,$x1,$y0,$y1,$t0)=@_; 238 $code.=<<___; 239 movdqa $y0, $t0 240 pxor $y1, $t0 241 pand $x0, $t0 242 pxor $x1, $x0 243 pand $y0, $x1 244 pand $y1, $x0 245 pxor $x0, $x1 246 pxor $t0, $x0 247 ___ 248 } 249 250 sub Mul_GF4_N_GF4 { 251 # interleaved Mul_GF4_N and Mul_GF4 252 my ($x0,$x1,$y0,$y1,$t0, 253 $x2,$x3,$y2,$y3,$t1)=@_; 254 $code.=<<___; 255 movdqa $y0, $t0 256 movdqa $y2, $t1 257 pxor $y1, $t0 258 pxor $y3, $t1 259 pand $x0, $t0 260 pand $x2, $t1 261 pxor $x1, $x0 262 pxor $x3, $x2 263 pand $y0, $x1 264 pand $y2, $x3 265 pand $y1, $x0 266 pand $y3, $x2 267 pxor $x0, $x1 268 pxor $x3, $x2 269 pxor $t0, $x0 270 pxor $t1, $x3 271 ___ 272 } 273 sub Mul_GF16_2 { 274 my @x=@_[0..7]; 275 my @y=@_[8..11]; 276 my @t=@_[12..15]; 277 $code.=<<___; 278 movdqa @x[0], @t[0] 279 movdqa @x[1], @t[1] 280 ___ 281 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 282 $code.=<<___; 283 pxor @x[2], @t[0] 284 pxor @x[3], @t[1] 285 pxor @y[2], @y[0] 286 pxor @y[3], @y[1] 287 ___ 288 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 289 @x[2], @x[3], @y[2], @y[3], @t[2]); 290 $code.=<<___; 291 pxor @t[0], @x[0] 292 pxor @t[0], @x[2] 293 pxor @t[1], @x[1] 294 pxor @t[1], @x[3] 295 296 movdqa @x[4], @t[0] 297 movdqa @x[5], @t[1] 298 pxor @x[6], @t[0] 299 pxor @x[7], @t[1] 300 ___ 301 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 302 @x[6], @x[7], @y[2], @y[3], @t[2]); 303 $code.=<<___; 304 pxor @y[2], @y[0] 305 pxor @y[3], @y[1] 306 ___ 307 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 308 $code.=<<___; 309 pxor @t[0], @x[4] 310 pxor @t[0], @x[6] 311 pxor @t[1], @x[5] 312 pxor @t[1], @x[7] 313 ___ 314 } 315 sub Inv_GF256 { 316 #;******************************************************************** 317 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 318 #;******************************************************************** 319 my @x=@_[0..7]; 320 my @t=@_[8..11]; 321 my @s=@_[12..15]; 322 # direct optimizations from hardware 323 $code.=<<___; 324 movdqa @x[4], @t[3] 325 movdqa @x[5], @t[2] 326 movdqa @x[1], @t[1] 327 movdqa @x[7], @s[1] 328 movdqa @x[0], @s[0] 329 330 pxor @x[6], @t[3] 331 pxor @x[7], @t[2] 332 pxor @x[3], @t[1] 333 movdqa @t[3], @s[2] 334 pxor @x[6], @s[1] 335 movdqa @t[2], @t[0] 336 pxor @x[2], @s[0] 337 movdqa @t[3], @s[3] 338 339 por @t[1], @t[2] 340 por @s[0], @t[3] 341 pxor @t[0], @s[3] 342 pand @s[0], @s[2] 343 pxor @t[1], @s[0] 344 pand @t[1], @t[0] 345 pand @s[0], @s[3] 346 movdqa @x[3], @s[0] 347 pxor @x[2], @s[0] 348 pand @s[0], @s[1] 349 pxor @s[1], @t[3] 350 pxor @s[1], @t[2] 351 movdqa @x[4], @s[1] 352 movdqa @x[1], @s[0] 353 pxor @x[5], @s[1] 354 pxor @x[0], @s[0] 355 movdqa @s[1], @t[1] 356 pand @s[0], @s[1] 357 por @s[0], @t[1] 358 pxor @s[1], @t[0] 359 pxor @s[3], @t[3] 360 pxor @s[2], @t[2] 361 pxor @s[3], @t[1] 362 movdqa @x[7], @s[0] 363 pxor @s[2], @t[0] 364 movdqa @x[6], @s[1] 365 pxor @s[2], @t[1] 366 movdqa @x[5], @s[2] 367 pand @x[3], @s[0] 368 movdqa @x[4], @s[3] 369 pand @x[2], @s[1] 370 pand @x[1], @s[2] 371 por @x[0], @s[3] 372 pxor @s[0], @t[3] 373 pxor @s[1], @t[2] 374 pxor @s[2], @t[1] 375 pxor @s[3], @t[0] 376 377 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 378 379 # new smaller inversion 380 381 movdqa @t[3], @s[0] 382 pand @t[1], @t[3] 383 pxor @t[2], @s[0] 384 385 movdqa @t[0], @s[2] 386 movdqa @s[0], @s[3] 387 pxor @t[3], @s[2] 388 pand @s[2], @s[3] 389 390 movdqa @t[1], @s[1] 391 pxor @t[2], @s[3] 392 pxor @t[0], @s[1] 393 394 pxor @t[2], @t[3] 395 396 pand @t[3], @s[1] 397 398 movdqa @s[2], @t[2] 399 pxor @t[0], @s[1] 400 401 pxor @s[1], @t[2] 402 pxor @s[1], @t[1] 403 404 pand @t[0], @t[2] 405 406 pxor @t[2], @s[2] 407 pxor @t[2], @t[1] 408 409 pand @s[3], @s[2] 410 411 pxor @s[0], @s[2] 412 ___ 413 # output in s3, s2, s1, t1 414 415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 416 417 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 418 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 419 420 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 421 } 422 423 # AES linear components 424 425 sub ShiftRows { 426 my @x=@_[0..7]; 427 my $mask=pop; 428 $code.=<<___; 429 pxor 0x00($key),@x[0] 430 pxor 0x10($key),@x[1] 431 pxor 0x20($key),@x[2] 432 pxor 0x30($key),@x[3] 433 pshufb $mask,@x[0] 434 pshufb $mask,@x[1] 435 pxor 0x40($key),@x[4] 436 pxor 0x50($key),@x[5] 437 pshufb $mask,@x[2] 438 pshufb $mask,@x[3] 439 pxor 0x60($key),@x[6] 440 pxor 0x70($key),@x[7] 441 pshufb $mask,@x[4] 442 pshufb $mask,@x[5] 443 pshufb $mask,@x[6] 444 pshufb $mask,@x[7] 445 lea 0x80($key),$key 446 ___ 447 } 448 449 sub MixColumns { 450 # modified to emit output in order suitable for feeding back to aesenc[last] 451 my @x=@_[0..7]; 452 my @t=@_[8..15]; 453 my $inv=@_[16]; # optional 454 $code.=<<___; 455 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 456 pshufd \$0x93, @x[1], @t[1] 457 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 458 pshufd \$0x93, @x[2], @t[2] 459 pxor @t[1], @x[1] 460 pshufd \$0x93, @x[3], @t[3] 461 pxor @t[2], @x[2] 462 pshufd \$0x93, @x[4], @t[4] 463 pxor @t[3], @x[3] 464 pshufd \$0x93, @x[5], @t[5] 465 pxor @t[4], @x[4] 466 pshufd \$0x93, @x[6], @t[6] 467 pxor @t[5], @x[5] 468 pshufd \$0x93, @x[7], @t[7] 469 pxor @t[6], @x[6] 470 pxor @t[7], @x[7] 471 472 pxor @x[0], @t[1] 473 pxor @x[7], @t[0] 474 pxor @x[7], @t[1] 475 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 476 pxor @x[1], @t[2] 477 pshufd \$0x4E, @x[1], @x[1] 478 pxor @x[4], @t[5] 479 pxor @t[0], @x[0] 480 pxor @x[5], @t[6] 481 pxor @t[1], @x[1] 482 pxor @x[3], @t[4] 483 pshufd \$0x4E, @x[4], @t[0] 484 pxor @x[6], @t[7] 485 pshufd \$0x4E, @x[5], @t[1] 486 pxor @x[2], @t[3] 487 pshufd \$0x4E, @x[3], @x[4] 488 pxor @x[7], @t[3] 489 pshufd \$0x4E, @x[7], @x[5] 490 pxor @x[7], @t[4] 491 pshufd \$0x4E, @x[6], @x[3] 492 pxor @t[4], @t[0] 493 pshufd \$0x4E, @x[2], @x[6] 494 pxor @t[5], @t[1] 495 ___ 496 $code.=<<___ if (!$inv); 497 pxor @t[3], @x[4] 498 pxor @t[7], @x[5] 499 pxor @t[6], @x[3] 500 movdqa @t[0], @x[2] 501 pxor @t[2], @x[6] 502 movdqa @t[1], @x[7] 503 ___ 504 $code.=<<___ if ($inv); 505 pxor @x[4], @t[3] 506 pxor @t[7], @x[5] 507 pxor @x[3], @t[6] 508 movdqa @t[0], @x[3] 509 pxor @t[2], @x[6] 510 movdqa @t[6], @x[2] 511 movdqa @t[1], @x[7] 512 movdqa @x[6], @x[4] 513 movdqa @t[3], @x[6] 514 ___ 515 } 516 517 sub InvMixColumns_orig { 518 my @x=@_[0..7]; 519 my @t=@_[8..15]; 520 521 $code.=<<___; 522 # multiplication by 0x0e 523 pshufd \$0x93, @x[7], @t[7] 524 movdqa @x[2], @t[2] 525 pxor @x[5], @x[7] # 7 5 526 pxor @x[5], @x[2] # 2 5 527 pshufd \$0x93, @x[0], @t[0] 528 movdqa @x[5], @t[5] 529 pxor @x[0], @x[5] # 5 0 [1] 530 pxor @x[1], @x[0] # 0 1 531 pshufd \$0x93, @x[1], @t[1] 532 pxor @x[2], @x[1] # 1 25 533 pxor @x[6], @x[0] # 01 6 [2] 534 pxor @x[3], @x[1] # 125 3 [4] 535 pshufd \$0x93, @x[3], @t[3] 536 pxor @x[0], @x[2] # 25 016 [3] 537 pxor @x[7], @x[3] # 3 75 538 pxor @x[6], @x[7] # 75 6 [0] 539 pshufd \$0x93, @x[6], @t[6] 540 movdqa @x[4], @t[4] 541 pxor @x[4], @x[6] # 6 4 542 pxor @x[3], @x[4] # 4 375 [6] 543 pxor @x[7], @x[3] # 375 756=36 544 pxor @t[5], @x[6] # 64 5 [7] 545 pxor @t[2], @x[3] # 36 2 546 pxor @t[4], @x[3] # 362 4 [5] 547 pshufd \$0x93, @t[5], @t[5] 548 ___ 549 my @y = @x[7,5,0,2,1,3,4,6]; 550 $code.=<<___; 551 # multiplication by 0x0b 552 pxor @y[0], @y[1] 553 pxor @t[0], @y[0] 554 pxor @t[1], @y[1] 555 pshufd \$0x93, @t[2], @t[2] 556 pxor @t[5], @y[0] 557 pxor @t[6], @y[1] 558 pxor @t[7], @y[0] 559 pshufd \$0x93, @t[4], @t[4] 560 pxor @t[6], @t[7] # clobber t[7] 561 pxor @y[0], @y[1] 562 563 pxor @t[0], @y[3] 564 pshufd \$0x93, @t[0], @t[0] 565 pxor @t[1], @y[2] 566 pxor @t[1], @y[4] 567 pxor @t[2], @y[2] 568 pshufd \$0x93, @t[1], @t[1] 569 pxor @t[2], @y[3] 570 pxor @t[2], @y[5] 571 pxor @t[7], @y[2] 572 pshufd \$0x93, @t[2], @t[2] 573 pxor @t[3], @y[3] 574 pxor @t[3], @y[6] 575 pxor @t[3], @y[4] 576 pshufd \$0x93, @t[3], @t[3] 577 pxor @t[4], @y[7] 578 pxor @t[4], @y[5] 579 pxor @t[7], @y[7] 580 pxor @t[5], @y[3] 581 pxor @t[4], @y[4] 582 pxor @t[5], @t[7] # clobber t[7] even more 583 584 pxor @t[7], @y[5] 585 pshufd \$0x93, @t[4], @t[4] 586 pxor @t[7], @y[6] 587 pxor @t[7], @y[4] 588 589 pxor @t[5], @t[7] 590 pshufd \$0x93, @t[5], @t[5] 591 pxor @t[6], @t[7] # restore t[7] 592 593 # multiplication by 0x0d 594 pxor @y[7], @y[4] 595 pxor @t[4], @y[7] 596 pshufd \$0x93, @t[6], @t[6] 597 pxor @t[0], @y[2] 598 pxor @t[5], @y[7] 599 pxor @t[2], @y[2] 600 pshufd \$0x93, @t[7], @t[7] 601 602 pxor @y[1], @y[3] 603 pxor @t[1], @y[1] 604 pxor @t[0], @y[0] 605 pxor @t[0], @y[3] 606 pxor @t[5], @y[1] 607 pxor @t[5], @y[0] 608 pxor @t[7], @y[1] 609 pshufd \$0x93, @t[0], @t[0] 610 pxor @t[6], @y[0] 611 pxor @y[1], @y[3] 612 pxor @t[1], @y[4] 613 pshufd \$0x93, @t[1], @t[1] 614 615 pxor @t[7], @y[7] 616 pxor @t[2], @y[4] 617 pxor @t[2], @y[5] 618 pshufd \$0x93, @t[2], @t[2] 619 pxor @t[6], @y[2] 620 pxor @t[3], @t[6] # clobber t[6] 621 pxor @y[7], @y[4] 622 pxor @t[6], @y[3] 623 624 pxor @t[6], @y[6] 625 pxor @t[5], @y[5] 626 pxor @t[4], @y[6] 627 pshufd \$0x93, @t[4], @t[4] 628 pxor @t[6], @y[5] 629 pxor @t[7], @y[6] 630 pxor @t[3], @t[6] # restore t[6] 631 632 pshufd \$0x93, @t[5], @t[5] 633 pshufd \$0x93, @t[6], @t[6] 634 pshufd \$0x93, @t[7], @t[7] 635 pshufd \$0x93, @t[3], @t[3] 636 637 # multiplication by 0x09 638 pxor @y[1], @y[4] 639 pxor @y[1], @t[1] # t[1]=y[1] 640 pxor @t[5], @t[0] # clobber t[0] 641 pxor @t[5], @t[1] 642 pxor @t[0], @y[3] 643 pxor @y[0], @t[0] # t[0]=y[0] 644 pxor @t[6], @t[1] 645 pxor @t[7], @t[6] # clobber t[6] 646 pxor @t[1], @y[4] 647 pxor @t[4], @y[7] 648 pxor @y[4], @t[4] # t[4]=y[4] 649 pxor @t[3], @y[6] 650 pxor @y[3], @t[3] # t[3]=y[3] 651 pxor @t[2], @y[5] 652 pxor @y[2], @t[2] # t[2]=y[2] 653 pxor @t[7], @t[3] 654 pxor @y[5], @t[5] # t[5]=y[5] 655 pxor @t[6], @t[2] 656 pxor @t[6], @t[5] 657 pxor @y[6], @t[6] # t[6]=y[6] 658 pxor @y[7], @t[7] # t[7]=y[7] 659 660 movdqa @t[0],@XMM[0] 661 movdqa @t[1],@XMM[1] 662 movdqa @t[2],@XMM[2] 663 movdqa @t[3],@XMM[3] 664 movdqa @t[4],@XMM[4] 665 movdqa @t[5],@XMM[5] 666 movdqa @t[6],@XMM[6] 667 movdqa @t[7],@XMM[7] 668 ___ 669 } 670 671 sub InvMixColumns { 672 my @x=@_[0..7]; 673 my @t=@_[8..15]; 674 675 # Thanks to Jussi Kivilinna for providing pointer to 676 # 677 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 678 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 679 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 680 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 681 682 $code.=<<___; 683 # multiplication by 0x05-0x00-0x04-0x00 684 pshufd \$0x4E, @x[0], @t[0] 685 pshufd \$0x4E, @x[6], @t[6] 686 pxor @x[0], @t[0] 687 pshufd \$0x4E, @x[7], @t[7] 688 pxor @x[6], @t[6] 689 pshufd \$0x4E, @x[1], @t[1] 690 pxor @x[7], @t[7] 691 pshufd \$0x4E, @x[2], @t[2] 692 pxor @x[1], @t[1] 693 pshufd \$0x4E, @x[3], @t[3] 694 pxor @x[2], @t[2] 695 pxor @t[6], @x[0] 696 pxor @t[6], @x[1] 697 pshufd \$0x4E, @x[4], @t[4] 698 pxor @x[3], @t[3] 699 pxor @t[0], @x[2] 700 pxor @t[1], @x[3] 701 pshufd \$0x4E, @x[5], @t[5] 702 pxor @x[4], @t[4] 703 pxor @t[7], @x[1] 704 pxor @t[2], @x[4] 705 pxor @x[5], @t[5] 706 707 pxor @t[7], @x[2] 708 pxor @t[6], @x[3] 709 pxor @t[6], @x[4] 710 pxor @t[3], @x[5] 711 pxor @t[4], @x[6] 712 pxor @t[7], @x[4] 713 pxor @t[7], @x[5] 714 pxor @t[5], @x[7] 715 ___ 716 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 717 } 718 719 sub aesenc { # not used 720 my @b=@_[0..7]; 721 my @t=@_[8..15]; 722 $code.=<<___; 723 movdqa 0x30($const),@t[0] # .LSR 724 ___ 725 &ShiftRows (@b,@t[0]); 726 &Sbox (@b,@t); 727 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 728 } 729 730 sub aesenclast { # not used 731 my @b=@_[0..7]; 732 my @t=@_[8..15]; 733 $code.=<<___; 734 movdqa 0x40($const),@t[0] # .LSRM0 735 ___ 736 &ShiftRows (@b,@t[0]); 737 &Sbox (@b,@t); 738 $code.=<<___ 739 pxor 0x00($key),@b[0] 740 pxor 0x10($key),@b[1] 741 pxor 0x20($key),@b[4] 742 pxor 0x30($key),@b[6] 743 pxor 0x40($key),@b[3] 744 pxor 0x50($key),@b[7] 745 pxor 0x60($key),@b[2] 746 pxor 0x70($key),@b[5] 747 ___ 748 } 749 750 sub swapmove { 751 my ($a,$b,$n,$mask,$t)=@_; 752 $code.=<<___; 753 movdqa $b,$t 754 psrlq \$$n,$b 755 pxor $a,$b 756 pand $mask,$b 757 pxor $b,$a 758 psllq \$$n,$b 759 pxor $t,$b 760 ___ 761 } 762 sub swapmove2x { 763 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 764 $code.=<<___; 765 movdqa $b0,$t0 766 psrlq \$$n,$b0 767 movdqa $b1,$t1 768 psrlq \$$n,$b1 769 pxor $a0,$b0 770 pxor $a1,$b1 771 pand $mask,$b0 772 pand $mask,$b1 773 pxor $b0,$a0 774 psllq \$$n,$b0 775 pxor $b1,$a1 776 psllq \$$n,$b1 777 pxor $t0,$b0 778 pxor $t1,$b1 779 ___ 780 } 781 782 sub bitslice { 783 my @x=reverse(@_[0..7]); 784 my ($t0,$t1,$t2,$t3)=@_[8..11]; 785 $code.=<<___; 786 movdqa 0x00($const),$t0 # .LBS0 787 movdqa 0x10($const),$t1 # .LBS1 788 ___ 789 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 790 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 791 $code.=<<___; 792 movdqa 0x20($const),$t0 # .LBS2 793 ___ 794 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 795 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 796 797 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 798 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 799 } 800 801 $code.=<<___; 802 .text 803 804 .extern asm_AES_encrypt 805 .extern asm_AES_decrypt 806 807 .type _bsaes_encrypt8,\@abi-omnipotent 808 .align 64 809 _bsaes_encrypt8: 810 lea .LBS0(%rip), $const # constants table 811 812 movdqa ($key), @XMM[9] # round 0 key 813 lea 0x10($key), $key 814 movdqa 0x50($const), @XMM[8] # .LM0SR 815 pxor @XMM[9], @XMM[0] # xor with round0 key 816 pxor @XMM[9], @XMM[1] 817 pxor @XMM[9], @XMM[2] 818 pxor @XMM[9], @XMM[3] 819 pshufb @XMM[8], @XMM[0] 820 pshufb @XMM[8], @XMM[1] 821 pxor @XMM[9], @XMM[4] 822 pxor @XMM[9], @XMM[5] 823 pshufb @XMM[8], @XMM[2] 824 pshufb @XMM[8], @XMM[3] 825 pxor @XMM[9], @XMM[6] 826 pxor @XMM[9], @XMM[7] 827 pshufb @XMM[8], @XMM[4] 828 pshufb @XMM[8], @XMM[5] 829 pshufb @XMM[8], @XMM[6] 830 pshufb @XMM[8], @XMM[7] 831 _bsaes_encrypt8_bitslice: 832 ___ 833 &bitslice (@XMM[0..7, 8..11]); 834 $code.=<<___; 835 dec $rounds 836 jmp .Lenc_sbox 837 .align 16 838 .Lenc_loop: 839 ___ 840 &ShiftRows (@XMM[0..7, 8]); 841 $code.=".Lenc_sbox:\n"; 842 &Sbox (@XMM[0..7, 8..15]); 843 $code.=<<___; 844 dec $rounds 845 jl .Lenc_done 846 ___ 847 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 848 $code.=<<___; 849 movdqa 0x30($const), @XMM[8] # .LSR 850 jnz .Lenc_loop 851 movdqa 0x40($const), @XMM[8] # .LSRM0 852 jmp .Lenc_loop 853 .align 16 854 .Lenc_done: 855 ___ 856 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 857 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 858 $code.=<<___; 859 movdqa ($key), @XMM[8] # last round key 860 pxor @XMM[8], @XMM[4] 861 pxor @XMM[8], @XMM[6] 862 pxor @XMM[8], @XMM[3] 863 pxor @XMM[8], @XMM[7] 864 pxor @XMM[8], @XMM[2] 865 pxor @XMM[8], @XMM[5] 866 pxor @XMM[8], @XMM[0] 867 pxor @XMM[8], @XMM[1] 868 ret 869 .size _bsaes_encrypt8,.-_bsaes_encrypt8 870 871 .type _bsaes_decrypt8,\@abi-omnipotent 872 .align 64 873 _bsaes_decrypt8: 874 lea .LBS0(%rip), $const # constants table 875 876 movdqa ($key), @XMM[9] # round 0 key 877 lea 0x10($key), $key 878 movdqa -0x30($const), @XMM[8] # .LM0ISR 879 pxor @XMM[9], @XMM[0] # xor with round0 key 880 pxor @XMM[9], @XMM[1] 881 pxor @XMM[9], @XMM[2] 882 pxor @XMM[9], @XMM[3] 883 pshufb @XMM[8], @XMM[0] 884 pshufb @XMM[8], @XMM[1] 885 pxor @XMM[9], @XMM[4] 886 pxor @XMM[9], @XMM[5] 887 pshufb @XMM[8], @XMM[2] 888 pshufb @XMM[8], @XMM[3] 889 pxor @XMM[9], @XMM[6] 890 pxor @XMM[9], @XMM[7] 891 pshufb @XMM[8], @XMM[4] 892 pshufb @XMM[8], @XMM[5] 893 pshufb @XMM[8], @XMM[6] 894 pshufb @XMM[8], @XMM[7] 895 ___ 896 &bitslice (@XMM[0..7, 8..11]); 897 $code.=<<___; 898 dec $rounds 899 jmp .Ldec_sbox 900 .align 16 901 .Ldec_loop: 902 ___ 903 &ShiftRows (@XMM[0..7, 8]); 904 $code.=".Ldec_sbox:\n"; 905 &InvSbox (@XMM[0..7, 8..15]); 906 $code.=<<___; 907 dec $rounds 908 jl .Ldec_done 909 ___ 910 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 911 $code.=<<___; 912 movdqa -0x10($const), @XMM[8] # .LISR 913 jnz .Ldec_loop 914 movdqa -0x20($const), @XMM[8] # .LISRM0 915 jmp .Ldec_loop 916 .align 16 917 .Ldec_done: 918 ___ 919 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 920 $code.=<<___; 921 movdqa ($key), @XMM[8] # last round key 922 pxor @XMM[8], @XMM[6] 923 pxor @XMM[8], @XMM[4] 924 pxor @XMM[8], @XMM[2] 925 pxor @XMM[8], @XMM[7] 926 pxor @XMM[8], @XMM[3] 927 pxor @XMM[8], @XMM[5] 928 pxor @XMM[8], @XMM[0] 929 pxor @XMM[8], @XMM[1] 930 ret 931 .size _bsaes_decrypt8,.-_bsaes_decrypt8 932 ___ 933 } 934 { 935 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 936 937 sub bitslice_key { 938 my @x=reverse(@_[0..7]); 939 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 940 941 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 942 $code.=<<___; 943 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 944 movdqa @x[0], @x[2] 945 movdqa @x[1], @x[3] 946 ___ 947 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 948 949 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 950 $code.=<<___; 951 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 952 movdqa @x[0], @x[4] 953 movdqa @x[2], @x[6] 954 movdqa @x[1], @x[5] 955 movdqa @x[3], @x[7] 956 ___ 957 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 958 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 959 } 960 961 $code.=<<___; 962 .type _bsaes_key_convert,\@abi-omnipotent 963 .align 16 964 _bsaes_key_convert: 965 lea .Lmasks(%rip), $const 966 movdqu ($inp), %xmm7 # load round 0 key 967 lea 0x10($inp), $inp 968 movdqa 0x00($const), %xmm0 # 0x01... 969 movdqa 0x10($const), %xmm1 # 0x02... 970 movdqa 0x20($const), %xmm2 # 0x04... 971 movdqa 0x30($const), %xmm3 # 0x08... 972 movdqa 0x40($const), %xmm4 # .LM0 973 pcmpeqd %xmm5, %xmm5 # .LNOT 974 975 movdqu ($inp), %xmm6 # load round 1 key 976 movdqa %xmm7, ($out) # save round 0 key 977 lea 0x10($out), $out 978 dec $rounds 979 jmp .Lkey_loop 980 .align 16 981 .Lkey_loop: 982 pshufb %xmm4, %xmm6 # .LM0 983 984 movdqa %xmm0, %xmm8 985 movdqa %xmm1, %xmm9 986 987 pand %xmm6, %xmm8 988 pand %xmm6, %xmm9 989 movdqa %xmm2, %xmm10 990 pcmpeqb %xmm0, %xmm8 991 psllq \$4, %xmm0 # 0x10... 992 movdqa %xmm3, %xmm11 993 pcmpeqb %xmm1, %xmm9 994 psllq \$4, %xmm1 # 0x20... 995 996 pand %xmm6, %xmm10 997 pand %xmm6, %xmm11 998 movdqa %xmm0, %xmm12 999 pcmpeqb %xmm2, %xmm10 1000 psllq \$4, %xmm2 # 0x40... 1001 movdqa %xmm1, %xmm13 1002 pcmpeqb %xmm3, %xmm11 1003 psllq \$4, %xmm3 # 0x80... 1004 1005 movdqa %xmm2, %xmm14 1006 movdqa %xmm3, %xmm15 1007 pxor %xmm5, %xmm8 # "pnot" 1008 pxor %xmm5, %xmm9 1009 1010 pand %xmm6, %xmm12 1011 pand %xmm6, %xmm13 1012 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1013 pcmpeqb %xmm0, %xmm12 1014 psrlq \$4, %xmm0 # 0x01... 1015 movdqa %xmm9, 0x10($out) 1016 pcmpeqb %xmm1, %xmm13 1017 psrlq \$4, %xmm1 # 0x02... 1018 lea 0x10($inp), $inp 1019 1020 pand %xmm6, %xmm14 1021 pand %xmm6, %xmm15 1022 movdqa %xmm10, 0x20($out) 1023 pcmpeqb %xmm2, %xmm14 1024 psrlq \$4, %xmm2 # 0x04... 1025 movdqa %xmm11, 0x30($out) 1026 pcmpeqb %xmm3, %xmm15 1027 psrlq \$4, %xmm3 # 0x08... 1028 movdqu ($inp), %xmm6 # load next round key 1029 1030 pxor %xmm5, %xmm13 # "pnot" 1031 pxor %xmm5, %xmm14 1032 movdqa %xmm12, 0x40($out) 1033 movdqa %xmm13, 0x50($out) 1034 movdqa %xmm14, 0x60($out) 1035 movdqa %xmm15, 0x70($out) 1036 lea 0x80($out),$out 1037 dec $rounds 1038 jnz .Lkey_loop 1039 1040 movdqa 0x50($const), %xmm7 # .L63 1041 #movdqa %xmm6, ($out) # don't save last round key 1042 ret 1043 .size _bsaes_key_convert,.-_bsaes_key_convert 1044 ___ 1045 } 1046 1047 if (0 && !$win64) { # following four functions are unsupported interface 1048 # used for benchmarking... 1049 $code.=<<___; 1050 .globl bsaes_enc_key_convert 1051 .type bsaes_enc_key_convert,\@function,2 1052 .align 16 1053 bsaes_enc_key_convert: 1054 mov 240($inp),%r10d # pass rounds 1055 mov $inp,%rcx # pass key 1056 mov $out,%rax # pass key schedule 1057 call _bsaes_key_convert 1058 pxor %xmm6,%xmm7 # fix up last round key 1059 movdqa %xmm7,(%rax) # save last round key 1060 ret 1061 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1062 1063 .globl bsaes_encrypt_128 1064 .type bsaes_encrypt_128,\@function,4 1065 .align 16 1066 bsaes_encrypt_128: 1067 .Lenc128_loop: 1068 movdqu 0x00($inp), @XMM[0] # load input 1069 movdqu 0x10($inp), @XMM[1] 1070 movdqu 0x20($inp), @XMM[2] 1071 movdqu 0x30($inp), @XMM[3] 1072 movdqu 0x40($inp), @XMM[4] 1073 movdqu 0x50($inp), @XMM[5] 1074 movdqu 0x60($inp), @XMM[6] 1075 movdqu 0x70($inp), @XMM[7] 1076 mov $key, %rax # pass the $key 1077 lea 0x80($inp), $inp 1078 mov \$10,%r10d 1079 1080 call _bsaes_encrypt8 1081 1082 movdqu @XMM[0], 0x00($out) # write output 1083 movdqu @XMM[1], 0x10($out) 1084 movdqu @XMM[4], 0x20($out) 1085 movdqu @XMM[6], 0x30($out) 1086 movdqu @XMM[3], 0x40($out) 1087 movdqu @XMM[7], 0x50($out) 1088 movdqu @XMM[2], 0x60($out) 1089 movdqu @XMM[5], 0x70($out) 1090 lea 0x80($out), $out 1091 sub \$0x80,$len 1092 ja .Lenc128_loop 1093 ret 1094 .size bsaes_encrypt_128,.-bsaes_encrypt_128 1095 1096 .globl bsaes_dec_key_convert 1097 .type bsaes_dec_key_convert,\@function,2 1098 .align 16 1099 bsaes_dec_key_convert: 1100 mov 240($inp),%r10d # pass rounds 1101 mov $inp,%rcx # pass key 1102 mov $out,%rax # pass key schedule 1103 call _bsaes_key_convert 1104 pxor ($out),%xmm7 # fix up round 0 key 1105 movdqa %xmm6,(%rax) # save last round key 1106 movdqa %xmm7,($out) 1107 ret 1108 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1109 1110 .globl bsaes_decrypt_128 1111 .type bsaes_decrypt_128,\@function,4 1112 .align 16 1113 bsaes_decrypt_128: 1114 .Ldec128_loop: 1115 movdqu 0x00($inp), @XMM[0] # load input 1116 movdqu 0x10($inp), @XMM[1] 1117 movdqu 0x20($inp), @XMM[2] 1118 movdqu 0x30($inp), @XMM[3] 1119 movdqu 0x40($inp), @XMM[4] 1120 movdqu 0x50($inp), @XMM[5] 1121 movdqu 0x60($inp), @XMM[6] 1122 movdqu 0x70($inp), @XMM[7] 1123 mov $key, %rax # pass the $key 1124 lea 0x80($inp), $inp 1125 mov \$10,%r10d 1126 1127 call _bsaes_decrypt8 1128 1129 movdqu @XMM[0], 0x00($out) # write output 1130 movdqu @XMM[1], 0x10($out) 1131 movdqu @XMM[6], 0x20($out) 1132 movdqu @XMM[4], 0x30($out) 1133 movdqu @XMM[2], 0x40($out) 1134 movdqu @XMM[7], 0x50($out) 1135 movdqu @XMM[3], 0x60($out) 1136 movdqu @XMM[5], 0x70($out) 1137 lea 0x80($out), $out 1138 sub \$0x80,$len 1139 ja .Ldec128_loop 1140 ret 1141 .size bsaes_decrypt_128,.-bsaes_decrypt_128 1142 ___ 1143 } 1144 { 1145 ###################################################################### 1146 # 1147 # OpenSSL interface 1148 # 1149 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1150 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1151 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1152 1153 if ($ecb) { 1154 $code.=<<___; 1155 .globl bsaes_ecb_encrypt_blocks 1156 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1157 .align 16 1158 bsaes_ecb_encrypt_blocks: 1159 mov %rsp, %rax 1160 .Lecb_enc_prologue: 1161 push %rbp 1162 push %rbx 1163 push %r12 1164 push %r13 1165 push %r14 1166 push %r15 1167 lea -0x48(%rsp),%rsp 1168 ___ 1169 $code.=<<___ if ($win64); 1170 lea -0xa0(%rsp), %rsp 1171 movaps %xmm6, 0x40(%rsp) 1172 movaps %xmm7, 0x50(%rsp) 1173 movaps %xmm8, 0x60(%rsp) 1174 movaps %xmm9, 0x70(%rsp) 1175 movaps %xmm10, 0x80(%rsp) 1176 movaps %xmm11, 0x90(%rsp) 1177 movaps %xmm12, 0xa0(%rsp) 1178 movaps %xmm13, 0xb0(%rsp) 1179 movaps %xmm14, 0xc0(%rsp) 1180 movaps %xmm15, 0xd0(%rsp) 1181 .Lecb_enc_body: 1182 ___ 1183 $code.=<<___; 1184 mov %rsp,%rbp # backup %rsp 1185 mov 240($arg4),%eax # rounds 1186 mov $arg1,$inp # backup arguments 1187 mov $arg2,$out 1188 mov $arg3,$len 1189 mov $arg4,$key 1190 cmp \$8,$arg3 1191 jb .Lecb_enc_short 1192 1193 mov %eax,%ebx # backup rounds 1194 shl \$7,%rax # 128 bytes per inner round key 1195 sub \$`128-32`,%rax # size of bit-sliced key schedule 1196 sub %rax,%rsp 1197 mov %rsp,%rax # pass key schedule 1198 mov $key,%rcx # pass key 1199 mov %ebx,%r10d # pass rounds 1200 call _bsaes_key_convert 1201 pxor %xmm6,%xmm7 # fix up last round key 1202 movdqa %xmm7,(%rax) # save last round key 1203 1204 sub \$8,$len 1205 .Lecb_enc_loop: 1206 movdqu 0x00($inp), @XMM[0] # load input 1207 movdqu 0x10($inp), @XMM[1] 1208 movdqu 0x20($inp), @XMM[2] 1209 movdqu 0x30($inp), @XMM[3] 1210 movdqu 0x40($inp), @XMM[4] 1211 movdqu 0x50($inp), @XMM[5] 1212 mov %rsp, %rax # pass key schedule 1213 movdqu 0x60($inp), @XMM[6] 1214 mov %ebx,%r10d # pass rounds 1215 movdqu 0x70($inp), @XMM[7] 1216 lea 0x80($inp), $inp 1217 1218 call _bsaes_encrypt8 1219 1220 movdqu @XMM[0], 0x00($out) # write output 1221 movdqu @XMM[1], 0x10($out) 1222 movdqu @XMM[4], 0x20($out) 1223 movdqu @XMM[6], 0x30($out) 1224 movdqu @XMM[3], 0x40($out) 1225 movdqu @XMM[7], 0x50($out) 1226 movdqu @XMM[2], 0x60($out) 1227 movdqu @XMM[5], 0x70($out) 1228 lea 0x80($out), $out 1229 sub \$8,$len 1230 jnc .Lecb_enc_loop 1231 1232 add \$8,$len 1233 jz .Lecb_enc_done 1234 1235 movdqu 0x00($inp), @XMM[0] # load input 1236 mov %rsp, %rax # pass key schedule 1237 mov %ebx,%r10d # pass rounds 1238 cmp \$2,$len 1239 jb .Lecb_enc_one 1240 movdqu 0x10($inp), @XMM[1] 1241 je .Lecb_enc_two 1242 movdqu 0x20($inp), @XMM[2] 1243 cmp \$4,$len 1244 jb .Lecb_enc_three 1245 movdqu 0x30($inp), @XMM[3] 1246 je .Lecb_enc_four 1247 movdqu 0x40($inp), @XMM[4] 1248 cmp \$6,$len 1249 jb .Lecb_enc_five 1250 movdqu 0x50($inp), @XMM[5] 1251 je .Lecb_enc_six 1252 movdqu 0x60($inp), @XMM[6] 1253 call _bsaes_encrypt8 1254 movdqu @XMM[0], 0x00($out) # write output 1255 movdqu @XMM[1], 0x10($out) 1256 movdqu @XMM[4], 0x20($out) 1257 movdqu @XMM[6], 0x30($out) 1258 movdqu @XMM[3], 0x40($out) 1259 movdqu @XMM[7], 0x50($out) 1260 movdqu @XMM[2], 0x60($out) 1261 jmp .Lecb_enc_done 1262 .align 16 1263 .Lecb_enc_six: 1264 call _bsaes_encrypt8 1265 movdqu @XMM[0], 0x00($out) # write output 1266 movdqu @XMM[1], 0x10($out) 1267 movdqu @XMM[4], 0x20($out) 1268 movdqu @XMM[6], 0x30($out) 1269 movdqu @XMM[3], 0x40($out) 1270 movdqu @XMM[7], 0x50($out) 1271 jmp .Lecb_enc_done 1272 .align 16 1273 .Lecb_enc_five: 1274 call _bsaes_encrypt8 1275 movdqu @XMM[0], 0x00($out) # write output 1276 movdqu @XMM[1], 0x10($out) 1277 movdqu @XMM[4], 0x20($out) 1278 movdqu @XMM[6], 0x30($out) 1279 movdqu @XMM[3], 0x40($out) 1280 jmp .Lecb_enc_done 1281 .align 16 1282 .Lecb_enc_four: 1283 call _bsaes_encrypt8 1284 movdqu @XMM[0], 0x00($out) # write output 1285 movdqu @XMM[1], 0x10($out) 1286 movdqu @XMM[4], 0x20($out) 1287 movdqu @XMM[6], 0x30($out) 1288 jmp .Lecb_enc_done 1289 .align 16 1290 .Lecb_enc_three: 1291 call _bsaes_encrypt8 1292 movdqu @XMM[0], 0x00($out) # write output 1293 movdqu @XMM[1], 0x10($out) 1294 movdqu @XMM[4], 0x20($out) 1295 jmp .Lecb_enc_done 1296 .align 16 1297 .Lecb_enc_two: 1298 call _bsaes_encrypt8 1299 movdqu @XMM[0], 0x00($out) # write output 1300 movdqu @XMM[1], 0x10($out) 1301 jmp .Lecb_enc_done 1302 .align 16 1303 .Lecb_enc_one: 1304 call _bsaes_encrypt8 1305 movdqu @XMM[0], 0x00($out) # write output 1306 jmp .Lecb_enc_done 1307 .align 16 1308 .Lecb_enc_short: 1309 lea ($inp), $arg1 1310 lea ($out), $arg2 1311 lea ($key), $arg3 1312 call asm_AES_encrypt 1313 lea 16($inp), $inp 1314 lea 16($out), $out 1315 dec $len 1316 jnz .Lecb_enc_short 1317 1318 .Lecb_enc_done: 1319 lea (%rsp),%rax 1320 pxor %xmm0, %xmm0 1321 .Lecb_enc_bzero: # wipe key schedule [if any] 1322 movdqa %xmm0, 0x00(%rax) 1323 movdqa %xmm0, 0x10(%rax) 1324 lea 0x20(%rax), %rax 1325 cmp %rax, %rbp 1326 jb .Lecb_enc_bzero 1327 1328 lea (%rbp),%rsp # restore %rsp 1329 ___ 1330 $code.=<<___ if ($win64); 1331 movaps 0x40(%rbp), %xmm6 1332 movaps 0x50(%rbp), %xmm7 1333 movaps 0x60(%rbp), %xmm8 1334 movaps 0x70(%rbp), %xmm9 1335 movaps 0x80(%rbp), %xmm10 1336 movaps 0x90(%rbp), %xmm11 1337 movaps 0xa0(%rbp), %xmm12 1338 movaps 0xb0(%rbp), %xmm13 1339 movaps 0xc0(%rbp), %xmm14 1340 movaps 0xd0(%rbp), %xmm15 1341 lea 0xa0(%rbp), %rsp 1342 ___ 1343 $code.=<<___; 1344 mov 0x48(%rsp), %r15 1345 mov 0x50(%rsp), %r14 1346 mov 0x58(%rsp), %r13 1347 mov 0x60(%rsp), %r12 1348 mov 0x68(%rsp), %rbx 1349 mov 0x70(%rsp), %rax 1350 lea 0x78(%rsp), %rsp 1351 mov %rax, %rbp 1352 .Lecb_enc_epilogue: 1353 ret 1354 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1355 1356 .globl bsaes_ecb_decrypt_blocks 1357 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1358 .align 16 1359 bsaes_ecb_decrypt_blocks: 1360 mov %rsp, %rax 1361 .Lecb_dec_prologue: 1362 push %rbp 1363 push %rbx 1364 push %r12 1365 push %r13 1366 push %r14 1367 push %r15 1368 lea -0x48(%rsp),%rsp 1369 ___ 1370 $code.=<<___ if ($win64); 1371 lea -0xa0(%rsp), %rsp 1372 movaps %xmm6, 0x40(%rsp) 1373 movaps %xmm7, 0x50(%rsp) 1374 movaps %xmm8, 0x60(%rsp) 1375 movaps %xmm9, 0x70(%rsp) 1376 movaps %xmm10, 0x80(%rsp) 1377 movaps %xmm11, 0x90(%rsp) 1378 movaps %xmm12, 0xa0(%rsp) 1379 movaps %xmm13, 0xb0(%rsp) 1380 movaps %xmm14, 0xc0(%rsp) 1381 movaps %xmm15, 0xd0(%rsp) 1382 .Lecb_dec_body: 1383 ___ 1384 $code.=<<___; 1385 mov %rsp,%rbp # backup %rsp 1386 mov 240($arg4),%eax # rounds 1387 mov $arg1,$inp # backup arguments 1388 mov $arg2,$out 1389 mov $arg3,$len 1390 mov $arg4,$key 1391 cmp \$8,$arg3 1392 jb .Lecb_dec_short 1393 1394 mov %eax,%ebx # backup rounds 1395 shl \$7,%rax # 128 bytes per inner round key 1396 sub \$`128-32`,%rax # size of bit-sliced key schedule 1397 sub %rax,%rsp 1398 mov %rsp,%rax # pass key schedule 1399 mov $key,%rcx # pass key 1400 mov %ebx,%r10d # pass rounds 1401 call _bsaes_key_convert 1402 pxor (%rsp),%xmm7 # fix up 0 round key 1403 movdqa %xmm6,(%rax) # save last round key 1404 movdqa %xmm7,(%rsp) 1405 1406 sub \$8,$len 1407 .Lecb_dec_loop: 1408 movdqu 0x00($inp), @XMM[0] # load input 1409 movdqu 0x10($inp), @XMM[1] 1410 movdqu 0x20($inp), @XMM[2] 1411 movdqu 0x30($inp), @XMM[3] 1412 movdqu 0x40($inp), @XMM[4] 1413 movdqu 0x50($inp), @XMM[5] 1414 mov %rsp, %rax # pass key schedule 1415 movdqu 0x60($inp), @XMM[6] 1416 mov %ebx,%r10d # pass rounds 1417 movdqu 0x70($inp), @XMM[7] 1418 lea 0x80($inp), $inp 1419 1420 call _bsaes_decrypt8 1421 1422 movdqu @XMM[0], 0x00($out) # write output 1423 movdqu @XMM[1], 0x10($out) 1424 movdqu @XMM[6], 0x20($out) 1425 movdqu @XMM[4], 0x30($out) 1426 movdqu @XMM[2], 0x40($out) 1427 movdqu @XMM[7], 0x50($out) 1428 movdqu @XMM[3], 0x60($out) 1429 movdqu @XMM[5], 0x70($out) 1430 lea 0x80($out), $out 1431 sub \$8,$len 1432 jnc .Lecb_dec_loop 1433 1434 add \$8,$len 1435 jz .Lecb_dec_done 1436 1437 movdqu 0x00($inp), @XMM[0] # load input 1438 mov %rsp, %rax # pass key schedule 1439 mov %ebx,%r10d # pass rounds 1440 cmp \$2,$len 1441 jb .Lecb_dec_one 1442 movdqu 0x10($inp), @XMM[1] 1443 je .Lecb_dec_two 1444 movdqu 0x20($inp), @XMM[2] 1445 cmp \$4,$len 1446 jb .Lecb_dec_three 1447 movdqu 0x30($inp), @XMM[3] 1448 je .Lecb_dec_four 1449 movdqu 0x40($inp), @XMM[4] 1450 cmp \$6,$len 1451 jb .Lecb_dec_five 1452 movdqu 0x50($inp), @XMM[5] 1453 je .Lecb_dec_six 1454 movdqu 0x60($inp), @XMM[6] 1455 call _bsaes_decrypt8 1456 movdqu @XMM[0], 0x00($out) # write output 1457 movdqu @XMM[1], 0x10($out) 1458 movdqu @XMM[6], 0x20($out) 1459 movdqu @XMM[4], 0x30($out) 1460 movdqu @XMM[2], 0x40($out) 1461 movdqu @XMM[7], 0x50($out) 1462 movdqu @XMM[3], 0x60($out) 1463 jmp .Lecb_dec_done 1464 .align 16 1465 .Lecb_dec_six: 1466 call _bsaes_decrypt8 1467 movdqu @XMM[0], 0x00($out) # write output 1468 movdqu @XMM[1], 0x10($out) 1469 movdqu @XMM[6], 0x20($out) 1470 movdqu @XMM[4], 0x30($out) 1471 movdqu @XMM[2], 0x40($out) 1472 movdqu @XMM[7], 0x50($out) 1473 jmp .Lecb_dec_done 1474 .align 16 1475 .Lecb_dec_five: 1476 call _bsaes_decrypt8 1477 movdqu @XMM[0], 0x00($out) # write output 1478 movdqu @XMM[1], 0x10($out) 1479 movdqu @XMM[6], 0x20($out) 1480 movdqu @XMM[4], 0x30($out) 1481 movdqu @XMM[2], 0x40($out) 1482 jmp .Lecb_dec_done 1483 .align 16 1484 .Lecb_dec_four: 1485 call _bsaes_decrypt8 1486 movdqu @XMM[0], 0x00($out) # write output 1487 movdqu @XMM[1], 0x10($out) 1488 movdqu @XMM[6], 0x20($out) 1489 movdqu @XMM[4], 0x30($out) 1490 jmp .Lecb_dec_done 1491 .align 16 1492 .Lecb_dec_three: 1493 call _bsaes_decrypt8 1494 movdqu @XMM[0], 0x00($out) # write output 1495 movdqu @XMM[1], 0x10($out) 1496 movdqu @XMM[6], 0x20($out) 1497 jmp .Lecb_dec_done 1498 .align 16 1499 .Lecb_dec_two: 1500 call _bsaes_decrypt8 1501 movdqu @XMM[0], 0x00($out) # write output 1502 movdqu @XMM[1], 0x10($out) 1503 jmp .Lecb_dec_done 1504 .align 16 1505 .Lecb_dec_one: 1506 call _bsaes_decrypt8 1507 movdqu @XMM[0], 0x00($out) # write output 1508 jmp .Lecb_dec_done 1509 .align 16 1510 .Lecb_dec_short: 1511 lea ($inp), $arg1 1512 lea ($out), $arg2 1513 lea ($key), $arg3 1514 call asm_AES_decrypt 1515 lea 16($inp), $inp 1516 lea 16($out), $out 1517 dec $len 1518 jnz .Lecb_dec_short 1519 1520 .Lecb_dec_done: 1521 lea (%rsp),%rax 1522 pxor %xmm0, %xmm0 1523 .Lecb_dec_bzero: # wipe key schedule [if any] 1524 movdqa %xmm0, 0x00(%rax) 1525 movdqa %xmm0, 0x10(%rax) 1526 lea 0x20(%rax), %rax 1527 cmp %rax, %rbp 1528 jb .Lecb_dec_bzero 1529 1530 lea (%rbp),%rsp # restore %rsp 1531 ___ 1532 $code.=<<___ if ($win64); 1533 movaps 0x40(%rbp), %xmm6 1534 movaps 0x50(%rbp), %xmm7 1535 movaps 0x60(%rbp), %xmm8 1536 movaps 0x70(%rbp), %xmm9 1537 movaps 0x80(%rbp), %xmm10 1538 movaps 0x90(%rbp), %xmm11 1539 movaps 0xa0(%rbp), %xmm12 1540 movaps 0xb0(%rbp), %xmm13 1541 movaps 0xc0(%rbp), %xmm14 1542 movaps 0xd0(%rbp), %xmm15 1543 lea 0xa0(%rbp), %rsp 1544 ___ 1545 $code.=<<___; 1546 mov 0x48(%rsp), %r15 1547 mov 0x50(%rsp), %r14 1548 mov 0x58(%rsp), %r13 1549 mov 0x60(%rsp), %r12 1550 mov 0x68(%rsp), %rbx 1551 mov 0x70(%rsp), %rax 1552 lea 0x78(%rsp), %rsp 1553 mov %rax, %rbp 1554 .Lecb_dec_epilogue: 1555 ret 1556 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1557 ___ 1558 } 1559 $code.=<<___; 1560 .extern asm_AES_cbc_encrypt 1561 .globl bsaes_cbc_encrypt 1562 .type bsaes_cbc_encrypt,\@abi-omnipotent 1563 .align 16 1564 bsaes_cbc_encrypt: 1565 ___ 1566 $code.=<<___ if ($win64); 1567 mov 48(%rsp),$arg6 # pull direction flag 1568 ___ 1569 $code.=<<___; 1570 cmp \$0,$arg6 1571 jne asm_AES_cbc_encrypt 1572 cmp \$128,$arg3 1573 jb asm_AES_cbc_encrypt 1574 1575 mov %rsp, %rax 1576 .Lcbc_dec_prologue: 1577 push %rbp 1578 push %rbx 1579 push %r12 1580 push %r13 1581 push %r14 1582 push %r15 1583 lea -0x48(%rsp), %rsp 1584 ___ 1585 $code.=<<___ if ($win64); 1586 mov 0xa0(%rsp),$arg5 # pull ivp 1587 lea -0xa0(%rsp), %rsp 1588 movaps %xmm6, 0x40(%rsp) 1589 movaps %xmm7, 0x50(%rsp) 1590 movaps %xmm8, 0x60(%rsp) 1591 movaps %xmm9, 0x70(%rsp) 1592 movaps %xmm10, 0x80(%rsp) 1593 movaps %xmm11, 0x90(%rsp) 1594 movaps %xmm12, 0xa0(%rsp) 1595 movaps %xmm13, 0xb0(%rsp) 1596 movaps %xmm14, 0xc0(%rsp) 1597 movaps %xmm15, 0xd0(%rsp) 1598 .Lcbc_dec_body: 1599 ___ 1600 $code.=<<___; 1601 mov %rsp, %rbp # backup %rsp 1602 mov 240($arg4), %eax # rounds 1603 mov $arg1, $inp # backup arguments 1604 mov $arg2, $out 1605 mov $arg3, $len 1606 mov $arg4, $key 1607 mov $arg5, %rbx 1608 shr \$4, $len # bytes to blocks 1609 1610 mov %eax, %edx # rounds 1611 shl \$7, %rax # 128 bytes per inner round key 1612 sub \$`128-32`, %rax # size of bit-sliced key schedule 1613 sub %rax, %rsp 1614 1615 mov %rsp, %rax # pass key schedule 1616 mov $key, %rcx # pass key 1617 mov %edx, %r10d # pass rounds 1618 call _bsaes_key_convert 1619 pxor (%rsp),%xmm7 # fix up 0 round key 1620 movdqa %xmm6,(%rax) # save last round key 1621 movdqa %xmm7,(%rsp) 1622 1623 movdqu (%rbx), @XMM[15] # load IV 1624 sub \$8,$len 1625 .Lcbc_dec_loop: 1626 movdqu 0x00($inp), @XMM[0] # load input 1627 movdqu 0x10($inp), @XMM[1] 1628 movdqu 0x20($inp), @XMM[2] 1629 movdqu 0x30($inp), @XMM[3] 1630 movdqu 0x40($inp), @XMM[4] 1631 movdqu 0x50($inp), @XMM[5] 1632 mov %rsp, %rax # pass key schedule 1633 movdqu 0x60($inp), @XMM[6] 1634 mov %edx,%r10d # pass rounds 1635 movdqu 0x70($inp), @XMM[7] 1636 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1637 1638 call _bsaes_decrypt8 1639 1640 pxor 0x20(%rbp), @XMM[0] # ^= IV 1641 movdqu 0x00($inp), @XMM[8] # re-load input 1642 movdqu 0x10($inp), @XMM[9] 1643 pxor @XMM[8], @XMM[1] 1644 movdqu 0x20($inp), @XMM[10] 1645 pxor @XMM[9], @XMM[6] 1646 movdqu 0x30($inp), @XMM[11] 1647 pxor @XMM[10], @XMM[4] 1648 movdqu 0x40($inp), @XMM[12] 1649 pxor @XMM[11], @XMM[2] 1650 movdqu 0x50($inp), @XMM[13] 1651 pxor @XMM[12], @XMM[7] 1652 movdqu 0x60($inp), @XMM[14] 1653 pxor @XMM[13], @XMM[3] 1654 movdqu 0x70($inp), @XMM[15] # IV 1655 pxor @XMM[14], @XMM[5] 1656 movdqu @XMM[0], 0x00($out) # write output 1657 lea 0x80($inp), $inp 1658 movdqu @XMM[1], 0x10($out) 1659 movdqu @XMM[6], 0x20($out) 1660 movdqu @XMM[4], 0x30($out) 1661 movdqu @XMM[2], 0x40($out) 1662 movdqu @XMM[7], 0x50($out) 1663 movdqu @XMM[3], 0x60($out) 1664 movdqu @XMM[5], 0x70($out) 1665 lea 0x80($out), $out 1666 sub \$8,$len 1667 jnc .Lcbc_dec_loop 1668 1669 add \$8,$len 1670 jz .Lcbc_dec_done 1671 1672 movdqu 0x00($inp), @XMM[0] # load input 1673 mov %rsp, %rax # pass key schedule 1674 mov %edx, %r10d # pass rounds 1675 cmp \$2,$len 1676 jb .Lcbc_dec_one 1677 movdqu 0x10($inp), @XMM[1] 1678 je .Lcbc_dec_two 1679 movdqu 0x20($inp), @XMM[2] 1680 cmp \$4,$len 1681 jb .Lcbc_dec_three 1682 movdqu 0x30($inp), @XMM[3] 1683 je .Lcbc_dec_four 1684 movdqu 0x40($inp), @XMM[4] 1685 cmp \$6,$len 1686 jb .Lcbc_dec_five 1687 movdqu 0x50($inp), @XMM[5] 1688 je .Lcbc_dec_six 1689 movdqu 0x60($inp), @XMM[6] 1690 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1691 call _bsaes_decrypt8 1692 pxor 0x20(%rbp), @XMM[0] # ^= IV 1693 movdqu 0x00($inp), @XMM[8] # re-load input 1694 movdqu 0x10($inp), @XMM[9] 1695 pxor @XMM[8], @XMM[1] 1696 movdqu 0x20($inp), @XMM[10] 1697 pxor @XMM[9], @XMM[6] 1698 movdqu 0x30($inp), @XMM[11] 1699 pxor @XMM[10], @XMM[4] 1700 movdqu 0x40($inp), @XMM[12] 1701 pxor @XMM[11], @XMM[2] 1702 movdqu 0x50($inp), @XMM[13] 1703 pxor @XMM[12], @XMM[7] 1704 movdqu 0x60($inp), @XMM[15] # IV 1705 pxor @XMM[13], @XMM[3] 1706 movdqu @XMM[0], 0x00($out) # write output 1707 movdqu @XMM[1], 0x10($out) 1708 movdqu @XMM[6], 0x20($out) 1709 movdqu @XMM[4], 0x30($out) 1710 movdqu @XMM[2], 0x40($out) 1711 movdqu @XMM[7], 0x50($out) 1712 movdqu @XMM[3], 0x60($out) 1713 jmp .Lcbc_dec_done 1714 .align 16 1715 .Lcbc_dec_six: 1716 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1717 call _bsaes_decrypt8 1718 pxor 0x20(%rbp), @XMM[0] # ^= IV 1719 movdqu 0x00($inp), @XMM[8] # re-load input 1720 movdqu 0x10($inp), @XMM[9] 1721 pxor @XMM[8], @XMM[1] 1722 movdqu 0x20($inp), @XMM[10] 1723 pxor @XMM[9], @XMM[6] 1724 movdqu 0x30($inp), @XMM[11] 1725 pxor @XMM[10], @XMM[4] 1726 movdqu 0x40($inp), @XMM[12] 1727 pxor @XMM[11], @XMM[2] 1728 movdqu 0x50($inp), @XMM[15] # IV 1729 pxor @XMM[12], @XMM[7] 1730 movdqu @XMM[0], 0x00($out) # write output 1731 movdqu @XMM[1], 0x10($out) 1732 movdqu @XMM[6], 0x20($out) 1733 movdqu @XMM[4], 0x30($out) 1734 movdqu @XMM[2], 0x40($out) 1735 movdqu @XMM[7], 0x50($out) 1736 jmp .Lcbc_dec_done 1737 .align 16 1738 .Lcbc_dec_five: 1739 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1740 call _bsaes_decrypt8 1741 pxor 0x20(%rbp), @XMM[0] # ^= IV 1742 movdqu 0x00($inp), @XMM[8] # re-load input 1743 movdqu 0x10($inp), @XMM[9] 1744 pxor @XMM[8], @XMM[1] 1745 movdqu 0x20($inp), @XMM[10] 1746 pxor @XMM[9], @XMM[6] 1747 movdqu 0x30($inp), @XMM[11] 1748 pxor @XMM[10], @XMM[4] 1749 movdqu 0x40($inp), @XMM[15] # IV 1750 pxor @XMM[11], @XMM[2] 1751 movdqu @XMM[0], 0x00($out) # write output 1752 movdqu @XMM[1], 0x10($out) 1753 movdqu @XMM[6], 0x20($out) 1754 movdqu @XMM[4], 0x30($out) 1755 movdqu @XMM[2], 0x40($out) 1756 jmp .Lcbc_dec_done 1757 .align 16 1758 .Lcbc_dec_four: 1759 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1760 call _bsaes_decrypt8 1761 pxor 0x20(%rbp), @XMM[0] # ^= IV 1762 movdqu 0x00($inp), @XMM[8] # re-load input 1763 movdqu 0x10($inp), @XMM[9] 1764 pxor @XMM[8], @XMM[1] 1765 movdqu 0x20($inp), @XMM[10] 1766 pxor @XMM[9], @XMM[6] 1767 movdqu 0x30($inp), @XMM[15] # IV 1768 pxor @XMM[10], @XMM[4] 1769 movdqu @XMM[0], 0x00($out) # write output 1770 movdqu @XMM[1], 0x10($out) 1771 movdqu @XMM[6], 0x20($out) 1772 movdqu @XMM[4], 0x30($out) 1773 jmp .Lcbc_dec_done 1774 .align 16 1775 .Lcbc_dec_three: 1776 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1777 call _bsaes_decrypt8 1778 pxor 0x20(%rbp), @XMM[0] # ^= IV 1779 movdqu 0x00($inp), @XMM[8] # re-load input 1780 movdqu 0x10($inp), @XMM[9] 1781 pxor @XMM[8], @XMM[1] 1782 movdqu 0x20($inp), @XMM[15] # IV 1783 pxor @XMM[9], @XMM[6] 1784 movdqu @XMM[0], 0x00($out) # write output 1785 movdqu @XMM[1], 0x10($out) 1786 movdqu @XMM[6], 0x20($out) 1787 jmp .Lcbc_dec_done 1788 .align 16 1789 .Lcbc_dec_two: 1790 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1791 call _bsaes_decrypt8 1792 pxor 0x20(%rbp), @XMM[0] # ^= IV 1793 movdqu 0x00($inp), @XMM[8] # re-load input 1794 movdqu 0x10($inp), @XMM[15] # IV 1795 pxor @XMM[8], @XMM[1] 1796 movdqu @XMM[0], 0x00($out) # write output 1797 movdqu @XMM[1], 0x10($out) 1798 jmp .Lcbc_dec_done 1799 .align 16 1800 .Lcbc_dec_one: 1801 lea ($inp), $arg1 1802 lea 0x20(%rbp), $arg2 # buffer output 1803 lea ($key), $arg3 1804 call asm_AES_decrypt # doesn't touch %xmm 1805 pxor 0x20(%rbp), @XMM[15] # ^= IV 1806 movdqu @XMM[15], ($out) # write output 1807 movdqa @XMM[0], @XMM[15] # IV 1808 1809 .Lcbc_dec_done: 1810 movdqu @XMM[15], (%rbx) # return IV 1811 lea (%rsp), %rax 1812 pxor %xmm0, %xmm0 1813 .Lcbc_dec_bzero: # wipe key schedule [if any] 1814 movdqa %xmm0, 0x00(%rax) 1815 movdqa %xmm0, 0x10(%rax) 1816 lea 0x20(%rax), %rax 1817 cmp %rax, %rbp 1818 ja .Lcbc_dec_bzero 1819 1820 lea (%rbp),%rsp # restore %rsp 1821 ___ 1822 $code.=<<___ if ($win64); 1823 movaps 0x40(%rbp), %xmm6 1824 movaps 0x50(%rbp), %xmm7 1825 movaps 0x60(%rbp), %xmm8 1826 movaps 0x70(%rbp), %xmm9 1827 movaps 0x80(%rbp), %xmm10 1828 movaps 0x90(%rbp), %xmm11 1829 movaps 0xa0(%rbp), %xmm12 1830 movaps 0xb0(%rbp), %xmm13 1831 movaps 0xc0(%rbp), %xmm14 1832 movaps 0xd0(%rbp), %xmm15 1833 lea 0xa0(%rbp), %rsp 1834 ___ 1835 $code.=<<___; 1836 mov 0x48(%rsp), %r15 1837 mov 0x50(%rsp), %r14 1838 mov 0x58(%rsp), %r13 1839 mov 0x60(%rsp), %r12 1840 mov 0x68(%rsp), %rbx 1841 mov 0x70(%rsp), %rax 1842 lea 0x78(%rsp), %rsp 1843 mov %rax, %rbp 1844 .Lcbc_dec_epilogue: 1845 ret 1846 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1847 1848 .globl bsaes_ctr32_encrypt_blocks 1849 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1850 .align 16 1851 bsaes_ctr32_encrypt_blocks: 1852 mov %rsp, %rax 1853 .Lctr_enc_prologue: 1854 push %rbp 1855 push %rbx 1856 push %r12 1857 push %r13 1858 push %r14 1859 push %r15 1860 lea -0x48(%rsp), %rsp 1861 ___ 1862 $code.=<<___ if ($win64); 1863 mov 0xa0(%rsp),$arg5 # pull ivp 1864 lea -0xa0(%rsp), %rsp 1865 movaps %xmm6, 0x40(%rsp) 1866 movaps %xmm7, 0x50(%rsp) 1867 movaps %xmm8, 0x60(%rsp) 1868 movaps %xmm9, 0x70(%rsp) 1869 movaps %xmm10, 0x80(%rsp) 1870 movaps %xmm11, 0x90(%rsp) 1871 movaps %xmm12, 0xa0(%rsp) 1872 movaps %xmm13, 0xb0(%rsp) 1873 movaps %xmm14, 0xc0(%rsp) 1874 movaps %xmm15, 0xd0(%rsp) 1875 .Lctr_enc_body: 1876 ___ 1877 $code.=<<___; 1878 mov %rsp, %rbp # backup %rsp 1879 movdqu ($arg5), %xmm0 # load counter 1880 mov 240($arg4), %eax # rounds 1881 mov $arg1, $inp # backup arguments 1882 mov $arg2, $out 1883 mov $arg3, $len 1884 mov $arg4, $key 1885 movdqa %xmm0, 0x20(%rbp) # copy counter 1886 cmp \$8, $arg3 1887 jb .Lctr_enc_short 1888 1889 mov %eax, %ebx # rounds 1890 shl \$7, %rax # 128 bytes per inner round key 1891 sub \$`128-32`, %rax # size of bit-sliced key schedule 1892 sub %rax, %rsp 1893 1894 mov %rsp, %rax # pass key schedule 1895 mov $key, %rcx # pass key 1896 mov %ebx, %r10d # pass rounds 1897 call _bsaes_key_convert 1898 pxor %xmm6,%xmm7 # fix up last round key 1899 movdqa %xmm7,(%rax) # save last round key 1900 1901 movdqa (%rsp), @XMM[9] # load round0 key 1902 lea .LADD1(%rip), %r11 1903 movdqa 0x20(%rbp), @XMM[0] # counter copy 1904 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1905 pshufb @XMM[8], @XMM[9] # byte swap upper part 1906 pshufb @XMM[8], @XMM[0] 1907 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1908 jmp .Lctr_enc_loop 1909 .align 16 1910 .Lctr_enc_loop: 1911 movdqa @XMM[0], 0x20(%rbp) # save counter 1912 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1913 movdqa @XMM[0], @XMM[2] 1914 paddd 0x00(%r11), @XMM[1] # .LADD1 1915 movdqa @XMM[0], @XMM[3] 1916 paddd 0x10(%r11), @XMM[2] # .LADD2 1917 movdqa @XMM[0], @XMM[4] 1918 paddd 0x20(%r11), @XMM[3] # .LADD3 1919 movdqa @XMM[0], @XMM[5] 1920 paddd 0x30(%r11), @XMM[4] # .LADD4 1921 movdqa @XMM[0], @XMM[6] 1922 paddd 0x40(%r11), @XMM[5] # .LADD5 1923 movdqa @XMM[0], @XMM[7] 1924 paddd 0x50(%r11), @XMM[6] # .LADD6 1925 paddd 0x60(%r11), @XMM[7] # .LADD7 1926 1927 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 1928 # to flip byte order in 32-bit counter 1929 movdqa (%rsp), @XMM[9] # round 0 key 1930 lea 0x10(%rsp), %rax # pass key schedule 1931 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 1932 pxor @XMM[9], @XMM[0] # xor with round0 key 1933 pxor @XMM[9], @XMM[1] 1934 pxor @XMM[9], @XMM[2] 1935 pxor @XMM[9], @XMM[3] 1936 pshufb @XMM[8], @XMM[0] 1937 pshufb @XMM[8], @XMM[1] 1938 pxor @XMM[9], @XMM[4] 1939 pxor @XMM[9], @XMM[5] 1940 pshufb @XMM[8], @XMM[2] 1941 pshufb @XMM[8], @XMM[3] 1942 pxor @XMM[9], @XMM[6] 1943 pxor @XMM[9], @XMM[7] 1944 pshufb @XMM[8], @XMM[4] 1945 pshufb @XMM[8], @XMM[5] 1946 pshufb @XMM[8], @XMM[6] 1947 pshufb @XMM[8], @XMM[7] 1948 lea .LBS0(%rip), %r11 # constants table 1949 mov %ebx,%r10d # pass rounds 1950 1951 call _bsaes_encrypt8_bitslice 1952 1953 sub \$8,$len 1954 jc .Lctr_enc_loop_done 1955 1956 movdqu 0x00($inp), @XMM[8] # load input 1957 movdqu 0x10($inp), @XMM[9] 1958 movdqu 0x20($inp), @XMM[10] 1959 movdqu 0x30($inp), @XMM[11] 1960 movdqu 0x40($inp), @XMM[12] 1961 movdqu 0x50($inp), @XMM[13] 1962 movdqu 0x60($inp), @XMM[14] 1963 movdqu 0x70($inp), @XMM[15] 1964 lea 0x80($inp),$inp 1965 pxor @XMM[0], @XMM[8] 1966 movdqa 0x20(%rbp), @XMM[0] # load counter 1967 pxor @XMM[9], @XMM[1] 1968 movdqu @XMM[8], 0x00($out) # write output 1969 pxor @XMM[10], @XMM[4] 1970 movdqu @XMM[1], 0x10($out) 1971 pxor @XMM[11], @XMM[6] 1972 movdqu @XMM[4], 0x20($out) 1973 pxor @XMM[12], @XMM[3] 1974 movdqu @XMM[6], 0x30($out) 1975 pxor @XMM[13], @XMM[7] 1976 movdqu @XMM[3], 0x40($out) 1977 pxor @XMM[14], @XMM[2] 1978 movdqu @XMM[7], 0x50($out) 1979 pxor @XMM[15], @XMM[5] 1980 movdqu @XMM[2], 0x60($out) 1981 lea .LADD1(%rip), %r11 1982 movdqu @XMM[5], 0x70($out) 1983 lea 0x80($out), $out 1984 paddd 0x70(%r11), @XMM[0] # .LADD8 1985 jnz .Lctr_enc_loop 1986 1987 jmp .Lctr_enc_done 1988 .align 16 1989 .Lctr_enc_loop_done: 1990 add \$8, $len 1991 movdqu 0x00($inp), @XMM[8] # load input 1992 pxor @XMM[8], @XMM[0] 1993 movdqu @XMM[0], 0x00($out) # write output 1994 cmp \$2,$len 1995 jb .Lctr_enc_done 1996 movdqu 0x10($inp), @XMM[9] 1997 pxor @XMM[9], @XMM[1] 1998 movdqu @XMM[1], 0x10($out) 1999 je .Lctr_enc_done 2000 movdqu 0x20($inp), @XMM[10] 2001 pxor @XMM[10], @XMM[4] 2002 movdqu @XMM[4], 0x20($out) 2003 cmp \$4,$len 2004 jb .Lctr_enc_done 2005 movdqu 0x30($inp), @XMM[11] 2006 pxor @XMM[11], @XMM[6] 2007 movdqu @XMM[6], 0x30($out) 2008 je .Lctr_enc_done 2009 movdqu 0x40($inp), @XMM[12] 2010 pxor @XMM[12], @XMM[3] 2011 movdqu @XMM[3], 0x40($out) 2012 cmp \$6,$len 2013 jb .Lctr_enc_done 2014 movdqu 0x50($inp), @XMM[13] 2015 pxor @XMM[13], @XMM[7] 2016 movdqu @XMM[7], 0x50($out) 2017 je .Lctr_enc_done 2018 movdqu 0x60($inp), @XMM[14] 2019 pxor @XMM[14], @XMM[2] 2020 movdqu @XMM[2], 0x60($out) 2021 jmp .Lctr_enc_done 2022 2023 .align 16 2024 .Lctr_enc_short: 2025 lea 0x20(%rbp), $arg1 2026 lea 0x30(%rbp), $arg2 2027 lea ($key), $arg3 2028 call asm_AES_encrypt 2029 movdqu ($inp), @XMM[1] 2030 lea 16($inp), $inp 2031 mov 0x2c(%rbp), %eax # load 32-bit counter 2032 bswap %eax 2033 pxor 0x30(%rbp), @XMM[1] 2034 inc %eax # increment 2035 movdqu @XMM[1], ($out) 2036 bswap %eax 2037 lea 16($out), $out 2038 mov %eax, 0x2c(%rsp) # save 32-bit counter 2039 dec $len 2040 jnz .Lctr_enc_short 2041 2042 .Lctr_enc_done: 2043 lea (%rsp), %rax 2044 pxor %xmm0, %xmm0 2045 .Lctr_enc_bzero: # wipe key schedule [if any] 2046 movdqa %xmm0, 0x00(%rax) 2047 movdqa %xmm0, 0x10(%rax) 2048 lea 0x20(%rax), %rax 2049 cmp %rax, %rbp 2050 ja .Lctr_enc_bzero 2051 2052 lea (%rbp),%rsp # restore %rsp 2053 ___ 2054 $code.=<<___ if ($win64); 2055 movaps 0x40(%rbp), %xmm6 2056 movaps 0x50(%rbp), %xmm7 2057 movaps 0x60(%rbp), %xmm8 2058 movaps 0x70(%rbp), %xmm9 2059 movaps 0x80(%rbp), %xmm10 2060 movaps 0x90(%rbp), %xmm11 2061 movaps 0xa0(%rbp), %xmm12 2062 movaps 0xb0(%rbp), %xmm13 2063 movaps 0xc0(%rbp), %xmm14 2064 movaps 0xd0(%rbp), %xmm15 2065 lea 0xa0(%rbp), %rsp 2066 ___ 2067 $code.=<<___; 2068 mov 0x48(%rsp), %r15 2069 mov 0x50(%rsp), %r14 2070 mov 0x58(%rsp), %r13 2071 mov 0x60(%rsp), %r12 2072 mov 0x68(%rsp), %rbx 2073 mov 0x70(%rsp), %rax 2074 lea 0x78(%rsp), %rsp 2075 mov %rax, %rbp 2076 .Lctr_enc_epilogue: 2077 ret 2078 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2079 ___ 2080 ###################################################################### 2081 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2082 # const AES_KEY *key1, const AES_KEY *key2, 2083 # const unsigned char iv[16]); 2084 # 2085 my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2086 $arg6=~s/d$//; 2087 2088 $code.=<<___; 2089 .globl bsaes_xts_encrypt 2090 .type bsaes_xts_encrypt,\@abi-omnipotent 2091 .align 16 2092 bsaes_xts_encrypt: 2093 mov %rsp, %rax 2094 .Lxts_enc_prologue: 2095 push %rbp 2096 push %rbx 2097 push %r12 2098 push %r13 2099 push %r14 2100 push %r15 2101 lea -0x48(%rsp), %rsp 2102 ___ 2103 $code.=<<___ if ($win64); 2104 mov 0xa0(%rsp),$arg5 # pull key2 2105 mov 0xa8(%rsp),$arg6 # pull ivp 2106 lea -0xa0(%rsp), %rsp 2107 movaps %xmm6, 0x40(%rsp) 2108 movaps %xmm7, 0x50(%rsp) 2109 movaps %xmm8, 0x60(%rsp) 2110 movaps %xmm9, 0x70(%rsp) 2111 movaps %xmm10, 0x80(%rsp) 2112 movaps %xmm11, 0x90(%rsp) 2113 movaps %xmm12, 0xa0(%rsp) 2114 movaps %xmm13, 0xb0(%rsp) 2115 movaps %xmm14, 0xc0(%rsp) 2116 movaps %xmm15, 0xd0(%rsp) 2117 .Lxts_enc_body: 2118 ___ 2119 $code.=<<___; 2120 mov %rsp, %rbp # backup %rsp 2121 mov $arg1, $inp # backup arguments 2122 mov $arg2, $out 2123 mov $arg3, $len 2124 mov $arg4, $key 2125 2126 lea ($arg6), $arg1 2127 lea 0x20(%rbp), $arg2 2128 lea ($arg5), $arg3 2129 call asm_AES_encrypt # generate initial tweak 2130 2131 mov 240($key), %eax # rounds 2132 mov $len, %rbx # backup $len 2133 2134 mov %eax, %edx # rounds 2135 shl \$7, %rax # 128 bytes per inner round key 2136 sub \$`128-32`, %rax # size of bit-sliced key schedule 2137 sub %rax, %rsp 2138 2139 mov %rsp, %rax # pass key schedule 2140 mov $key, %rcx # pass key 2141 mov %edx, %r10d # pass rounds 2142 call _bsaes_key_convert 2143 pxor %xmm6, %xmm7 # fix up last round key 2144 movdqa %xmm7, (%rax) # save last round key 2145 2146 and \$-16, $len 2147 sub \$0x80, %rsp # place for tweak[8] 2148 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2149 2150 pxor $twtmp, $twtmp 2151 movdqa .Lxts_magic(%rip), $twmask 2152 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2153 2154 sub \$0x80, $len 2155 jc .Lxts_enc_short 2156 jmp .Lxts_enc_loop 2157 2158 .align 16 2159 .Lxts_enc_loop: 2160 ___ 2161 for ($i=0;$i<7;$i++) { 2162 $code.=<<___; 2163 pshufd \$0x13, $twtmp, $twres 2164 pxor $twtmp, $twtmp 2165 movdqa @XMM[7], @XMM[$i] 2166 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2167 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2168 pand $twmask, $twres # isolate carry and residue 2169 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2170 pxor $twres, @XMM[7] 2171 ___ 2172 $code.=<<___ if ($i>=1); 2173 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2174 ___ 2175 $code.=<<___ if ($i>=2); 2176 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2177 ___ 2178 } 2179 $code.=<<___; 2180 movdqu 0x60($inp), @XMM[8+6] 2181 pxor @XMM[8+5], @XMM[5] 2182 movdqu 0x70($inp), @XMM[8+7] 2183 lea 0x80($inp), $inp 2184 movdqa @XMM[7], 0x70(%rsp) 2185 pxor @XMM[8+6], @XMM[6] 2186 lea 0x80(%rsp), %rax # pass key schedule 2187 pxor @XMM[8+7], @XMM[7] 2188 mov %edx, %r10d # pass rounds 2189 2190 call _bsaes_encrypt8 2191 2192 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2193 pxor 0x10(%rsp), @XMM[1] 2194 movdqu @XMM[0], 0x00($out) # write output 2195 pxor 0x20(%rsp), @XMM[4] 2196 movdqu @XMM[1], 0x10($out) 2197 pxor 0x30(%rsp), @XMM[6] 2198 movdqu @XMM[4], 0x20($out) 2199 pxor 0x40(%rsp), @XMM[3] 2200 movdqu @XMM[6], 0x30($out) 2201 pxor 0x50(%rsp), @XMM[7] 2202 movdqu @XMM[3], 0x40($out) 2203 pxor 0x60(%rsp), @XMM[2] 2204 movdqu @XMM[7], 0x50($out) 2205 pxor 0x70(%rsp), @XMM[5] 2206 movdqu @XMM[2], 0x60($out) 2207 movdqu @XMM[5], 0x70($out) 2208 lea 0x80($out), $out 2209 2210 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2211 pxor $twtmp, $twtmp 2212 movdqa .Lxts_magic(%rip), $twmask 2213 pcmpgtd @XMM[7], $twtmp 2214 pshufd \$0x13, $twtmp, $twres 2215 pxor $twtmp, $twtmp 2216 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2217 pand $twmask, $twres # isolate carry and residue 2218 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2219 pxor $twres, @XMM[7] 2220 2221 sub \$0x80,$len 2222 jnc .Lxts_enc_loop 2223 2224 .Lxts_enc_short: 2225 add \$0x80, $len 2226 jz .Lxts_enc_done 2227 ___ 2228 for ($i=0;$i<7;$i++) { 2229 $code.=<<___; 2230 pshufd \$0x13, $twtmp, $twres 2231 pxor $twtmp, $twtmp 2232 movdqa @XMM[7], @XMM[$i] 2233 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2235 pand $twmask, $twres # isolate carry and residue 2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2237 pxor $twres, @XMM[7] 2238 ___ 2239 $code.=<<___ if ($i>=1); 2240 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2241 cmp \$`0x10*$i`,$len 2242 je .Lxts_enc_$i 2243 ___ 2244 $code.=<<___ if ($i>=2); 2245 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2246 ___ 2247 } 2248 $code.=<<___; 2249 movdqu 0x60($inp), @XMM[8+6] 2250 pxor @XMM[8+5], @XMM[5] 2251 movdqa @XMM[7], 0x70(%rsp) 2252 lea 0x70($inp), $inp 2253 pxor @XMM[8+6], @XMM[6] 2254 lea 0x80(%rsp), %rax # pass key schedule 2255 mov %edx, %r10d # pass rounds 2256 2257 call _bsaes_encrypt8 2258 2259 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2260 pxor 0x10(%rsp), @XMM[1] 2261 movdqu @XMM[0], 0x00($out) # write output 2262 pxor 0x20(%rsp), @XMM[4] 2263 movdqu @XMM[1], 0x10($out) 2264 pxor 0x30(%rsp), @XMM[6] 2265 movdqu @XMM[4], 0x20($out) 2266 pxor 0x40(%rsp), @XMM[3] 2267 movdqu @XMM[6], 0x30($out) 2268 pxor 0x50(%rsp), @XMM[7] 2269 movdqu @XMM[3], 0x40($out) 2270 pxor 0x60(%rsp), @XMM[2] 2271 movdqu @XMM[7], 0x50($out) 2272 movdqu @XMM[2], 0x60($out) 2273 lea 0x70($out), $out 2274 2275 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2276 jmp .Lxts_enc_done 2277 .align 16 2278 .Lxts_enc_6: 2279 pxor @XMM[8+4], @XMM[4] 2280 lea 0x60($inp), $inp 2281 pxor @XMM[8+5], @XMM[5] 2282 lea 0x80(%rsp), %rax # pass key schedule 2283 mov %edx, %r10d # pass rounds 2284 2285 call _bsaes_encrypt8 2286 2287 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2288 pxor 0x10(%rsp), @XMM[1] 2289 movdqu @XMM[0], 0x00($out) # write output 2290 pxor 0x20(%rsp), @XMM[4] 2291 movdqu @XMM[1], 0x10($out) 2292 pxor 0x30(%rsp), @XMM[6] 2293 movdqu @XMM[4], 0x20($out) 2294 pxor 0x40(%rsp), @XMM[3] 2295 movdqu @XMM[6], 0x30($out) 2296 pxor 0x50(%rsp), @XMM[7] 2297 movdqu @XMM[3], 0x40($out) 2298 movdqu @XMM[7], 0x50($out) 2299 lea 0x60($out), $out 2300 2301 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2302 jmp .Lxts_enc_done 2303 .align 16 2304 .Lxts_enc_5: 2305 pxor @XMM[8+3], @XMM[3] 2306 lea 0x50($inp), $inp 2307 pxor @XMM[8+4], @XMM[4] 2308 lea 0x80(%rsp), %rax # pass key schedule 2309 mov %edx, %r10d # pass rounds 2310 2311 call _bsaes_encrypt8 2312 2313 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2314 pxor 0x10(%rsp), @XMM[1] 2315 movdqu @XMM[0], 0x00($out) # write output 2316 pxor 0x20(%rsp), @XMM[4] 2317 movdqu @XMM[1], 0x10($out) 2318 pxor 0x30(%rsp), @XMM[6] 2319 movdqu @XMM[4], 0x20($out) 2320 pxor 0x40(%rsp), @XMM[3] 2321 movdqu @XMM[6], 0x30($out) 2322 movdqu @XMM[3], 0x40($out) 2323 lea 0x50($out), $out 2324 2325 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2326 jmp .Lxts_enc_done 2327 .align 16 2328 .Lxts_enc_4: 2329 pxor @XMM[8+2], @XMM[2] 2330 lea 0x40($inp), $inp 2331 pxor @XMM[8+3], @XMM[3] 2332 lea 0x80(%rsp), %rax # pass key schedule 2333 mov %edx, %r10d # pass rounds 2334 2335 call _bsaes_encrypt8 2336 2337 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2338 pxor 0x10(%rsp), @XMM[1] 2339 movdqu @XMM[0], 0x00($out) # write output 2340 pxor 0x20(%rsp), @XMM[4] 2341 movdqu @XMM[1], 0x10($out) 2342 pxor 0x30(%rsp), @XMM[6] 2343 movdqu @XMM[4], 0x20($out) 2344 movdqu @XMM[6], 0x30($out) 2345 lea 0x40($out), $out 2346 2347 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2348 jmp .Lxts_enc_done 2349 .align 16 2350 .Lxts_enc_3: 2351 pxor @XMM[8+1], @XMM[1] 2352 lea 0x30($inp), $inp 2353 pxor @XMM[8+2], @XMM[2] 2354 lea 0x80(%rsp), %rax # pass key schedule 2355 mov %edx, %r10d # pass rounds 2356 2357 call _bsaes_encrypt8 2358 2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2360 pxor 0x10(%rsp), @XMM[1] 2361 movdqu @XMM[0], 0x00($out) # write output 2362 pxor 0x20(%rsp), @XMM[4] 2363 movdqu @XMM[1], 0x10($out) 2364 movdqu @XMM[4], 0x20($out) 2365 lea 0x30($out), $out 2366 2367 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2368 jmp .Lxts_enc_done 2369 .align 16 2370 .Lxts_enc_2: 2371 pxor @XMM[8+0], @XMM[0] 2372 lea 0x20($inp), $inp 2373 pxor @XMM[8+1], @XMM[1] 2374 lea 0x80(%rsp), %rax # pass key schedule 2375 mov %edx, %r10d # pass rounds 2376 2377 call _bsaes_encrypt8 2378 2379 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2380 pxor 0x10(%rsp), @XMM[1] 2381 movdqu @XMM[0], 0x00($out) # write output 2382 movdqu @XMM[1], 0x10($out) 2383 lea 0x20($out), $out 2384 2385 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2386 jmp .Lxts_enc_done 2387 .align 16 2388 .Lxts_enc_1: 2389 pxor @XMM[0], @XMM[8] 2390 lea 0x10($inp), $inp 2391 movdqa @XMM[8], 0x20(%rbp) 2392 lea 0x20(%rbp), $arg1 2393 lea 0x20(%rbp), $arg2 2394 lea ($key), $arg3 2395 call asm_AES_encrypt # doesn't touch %xmm 2396 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2397 #pxor @XMM[8], @XMM[0] 2398 #lea 0x80(%rsp), %rax # pass key schedule 2399 #mov %edx, %r10d # pass rounds 2400 #call _bsaes_encrypt8 2401 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2402 movdqu @XMM[0], 0x00($out) # write output 2403 lea 0x10($out), $out 2404 2405 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2406 2407 .Lxts_enc_done: 2408 and \$15, %ebx 2409 jz .Lxts_enc_ret 2410 mov $out, %rdx 2411 2412 .Lxts_enc_steal: 2413 movzb ($inp), %eax 2414 movzb -16(%rdx), %ecx 2415 lea 1($inp), $inp 2416 mov %al, -16(%rdx) 2417 mov %cl, 0(%rdx) 2418 lea 1(%rdx), %rdx 2419 sub \$1,%ebx 2420 jnz .Lxts_enc_steal 2421 2422 movdqu -16($out), @XMM[0] 2423 lea 0x20(%rbp), $arg1 2424 pxor @XMM[7], @XMM[0] 2425 lea 0x20(%rbp), $arg2 2426 movdqa @XMM[0], 0x20(%rbp) 2427 lea ($key), $arg3 2428 call asm_AES_encrypt # doesn't touch %xmm 2429 pxor 0x20(%rbp), @XMM[7] 2430 movdqu @XMM[7], -16($out) 2431 2432 .Lxts_enc_ret: 2433 lea (%rsp), %rax 2434 pxor %xmm0, %xmm0 2435 .Lxts_enc_bzero: # wipe key schedule [if any] 2436 movdqa %xmm0, 0x00(%rax) 2437 movdqa %xmm0, 0x10(%rax) 2438 lea 0x20(%rax), %rax 2439 cmp %rax, %rbp 2440 ja .Lxts_enc_bzero 2441 2442 lea (%rbp),%rsp # restore %rsp 2443 ___ 2444 $code.=<<___ if ($win64); 2445 movaps 0x40(%rbp), %xmm6 2446 movaps 0x50(%rbp), %xmm7 2447 movaps 0x60(%rbp), %xmm8 2448 movaps 0x70(%rbp), %xmm9 2449 movaps 0x80(%rbp), %xmm10 2450 movaps 0x90(%rbp), %xmm11 2451 movaps 0xa0(%rbp), %xmm12 2452 movaps 0xb0(%rbp), %xmm13 2453 movaps 0xc0(%rbp), %xmm14 2454 movaps 0xd0(%rbp), %xmm15 2455 lea 0xa0(%rbp), %rsp 2456 ___ 2457 $code.=<<___; 2458 mov 0x48(%rsp), %r15 2459 mov 0x50(%rsp), %r14 2460 mov 0x58(%rsp), %r13 2461 mov 0x60(%rsp), %r12 2462 mov 0x68(%rsp), %rbx 2463 mov 0x70(%rsp), %rax 2464 lea 0x78(%rsp), %rsp 2465 mov %rax, %rbp 2466 .Lxts_enc_epilogue: 2467 ret 2468 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2469 2470 .globl bsaes_xts_decrypt 2471 .type bsaes_xts_decrypt,\@abi-omnipotent 2472 .align 16 2473 bsaes_xts_decrypt: 2474 mov %rsp, %rax 2475 .Lxts_dec_prologue: 2476 push %rbp 2477 push %rbx 2478 push %r12 2479 push %r13 2480 push %r14 2481 push %r15 2482 lea -0x48(%rsp), %rsp 2483 ___ 2484 $code.=<<___ if ($win64); 2485 mov 0xa0(%rsp),$arg5 # pull key2 2486 mov 0xa8(%rsp),$arg6 # pull ivp 2487 lea -0xa0(%rsp), %rsp 2488 movaps %xmm6, 0x40(%rsp) 2489 movaps %xmm7, 0x50(%rsp) 2490 movaps %xmm8, 0x60(%rsp) 2491 movaps %xmm9, 0x70(%rsp) 2492 movaps %xmm10, 0x80(%rsp) 2493 movaps %xmm11, 0x90(%rsp) 2494 movaps %xmm12, 0xa0(%rsp) 2495 movaps %xmm13, 0xb0(%rsp) 2496 movaps %xmm14, 0xc0(%rsp) 2497 movaps %xmm15, 0xd0(%rsp) 2498 .Lxts_dec_body: 2499 ___ 2500 $code.=<<___; 2501 mov %rsp, %rbp # backup %rsp 2502 mov $arg1, $inp # backup arguments 2503 mov $arg2, $out 2504 mov $arg3, $len 2505 mov $arg4, $key 2506 2507 lea ($arg6), $arg1 2508 lea 0x20(%rbp), $arg2 2509 lea ($arg5), $arg3 2510 call asm_AES_encrypt # generate initial tweak 2511 2512 mov 240($key), %eax # rounds 2513 mov $len, %rbx # backup $len 2514 2515 mov %eax, %edx # rounds 2516 shl \$7, %rax # 128 bytes per inner round key 2517 sub \$`128-32`, %rax # size of bit-sliced key schedule 2518 sub %rax, %rsp 2519 2520 mov %rsp, %rax # pass key schedule 2521 mov $key, %rcx # pass key 2522 mov %edx, %r10d # pass rounds 2523 call _bsaes_key_convert 2524 pxor (%rsp), %xmm7 # fix up round 0 key 2525 movdqa %xmm6, (%rax) # save last round key 2526 movdqa %xmm7, (%rsp) 2527 2528 xor %eax, %eax # if ($len%16) len-=16; 2529 and \$-16, $len 2530 test \$15, %ebx 2531 setnz %al 2532 shl \$4, %rax 2533 sub %rax, $len 2534 2535 sub \$0x80, %rsp # place for tweak[8] 2536 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2537 2538 pxor $twtmp, $twtmp 2539 movdqa .Lxts_magic(%rip), $twmask 2540 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2541 2542 sub \$0x80, $len 2543 jc .Lxts_dec_short 2544 jmp .Lxts_dec_loop 2545 2546 .align 16 2547 .Lxts_dec_loop: 2548 ___ 2549 for ($i=0;$i<7;$i++) { 2550 $code.=<<___; 2551 pshufd \$0x13, $twtmp, $twres 2552 pxor $twtmp, $twtmp 2553 movdqa @XMM[7], @XMM[$i] 2554 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2555 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2556 pand $twmask, $twres # isolate carry and residue 2557 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2558 pxor $twres, @XMM[7] 2559 ___ 2560 $code.=<<___ if ($i>=1); 2561 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2562 ___ 2563 $code.=<<___ if ($i>=2); 2564 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2565 ___ 2566 } 2567 $code.=<<___; 2568 movdqu 0x60($inp), @XMM[8+6] 2569 pxor @XMM[8+5], @XMM[5] 2570 movdqu 0x70($inp), @XMM[8+7] 2571 lea 0x80($inp), $inp 2572 movdqa @XMM[7], 0x70(%rsp) 2573 pxor @XMM[8+6], @XMM[6] 2574 lea 0x80(%rsp), %rax # pass key schedule 2575 pxor @XMM[8+7], @XMM[7] 2576 mov %edx, %r10d # pass rounds 2577 2578 call _bsaes_decrypt8 2579 2580 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2581 pxor 0x10(%rsp), @XMM[1] 2582 movdqu @XMM[0], 0x00($out) # write output 2583 pxor 0x20(%rsp), @XMM[6] 2584 movdqu @XMM[1], 0x10($out) 2585 pxor 0x30(%rsp), @XMM[4] 2586 movdqu @XMM[6], 0x20($out) 2587 pxor 0x40(%rsp), @XMM[2] 2588 movdqu @XMM[4], 0x30($out) 2589 pxor 0x50(%rsp), @XMM[7] 2590 movdqu @XMM[2], 0x40($out) 2591 pxor 0x60(%rsp), @XMM[3] 2592 movdqu @XMM[7], 0x50($out) 2593 pxor 0x70(%rsp), @XMM[5] 2594 movdqu @XMM[3], 0x60($out) 2595 movdqu @XMM[5], 0x70($out) 2596 lea 0x80($out), $out 2597 2598 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2599 pxor $twtmp, $twtmp 2600 movdqa .Lxts_magic(%rip), $twmask 2601 pcmpgtd @XMM[7], $twtmp 2602 pshufd \$0x13, $twtmp, $twres 2603 pxor $twtmp, $twtmp 2604 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2605 pand $twmask, $twres # isolate carry and residue 2606 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2607 pxor $twres, @XMM[7] 2608 2609 sub \$0x80,$len 2610 jnc .Lxts_dec_loop 2611 2612 .Lxts_dec_short: 2613 add \$0x80, $len 2614 jz .Lxts_dec_done 2615 ___ 2616 for ($i=0;$i<7;$i++) { 2617 $code.=<<___; 2618 pshufd \$0x13, $twtmp, $twres 2619 pxor $twtmp, $twtmp 2620 movdqa @XMM[7], @XMM[$i] 2621 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2622 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2623 pand $twmask, $twres # isolate carry and residue 2624 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2625 pxor $twres, @XMM[7] 2626 ___ 2627 $code.=<<___ if ($i>=1); 2628 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2629 cmp \$`0x10*$i`,$len 2630 je .Lxts_dec_$i 2631 ___ 2632 $code.=<<___ if ($i>=2); 2633 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2634 ___ 2635 } 2636 $code.=<<___; 2637 movdqu 0x60($inp), @XMM[8+6] 2638 pxor @XMM[8+5], @XMM[5] 2639 movdqa @XMM[7], 0x70(%rsp) 2640 lea 0x70($inp), $inp 2641 pxor @XMM[8+6], @XMM[6] 2642 lea 0x80(%rsp), %rax # pass key schedule 2643 mov %edx, %r10d # pass rounds 2644 2645 call _bsaes_decrypt8 2646 2647 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2648 pxor 0x10(%rsp), @XMM[1] 2649 movdqu @XMM[0], 0x00($out) # write output 2650 pxor 0x20(%rsp), @XMM[6] 2651 movdqu @XMM[1], 0x10($out) 2652 pxor 0x30(%rsp), @XMM[4] 2653 movdqu @XMM[6], 0x20($out) 2654 pxor 0x40(%rsp), @XMM[2] 2655 movdqu @XMM[4], 0x30($out) 2656 pxor 0x50(%rsp), @XMM[7] 2657 movdqu @XMM[2], 0x40($out) 2658 pxor 0x60(%rsp), @XMM[3] 2659 movdqu @XMM[7], 0x50($out) 2660 movdqu @XMM[3], 0x60($out) 2661 lea 0x70($out), $out 2662 2663 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2664 jmp .Lxts_dec_done 2665 .align 16 2666 .Lxts_dec_6: 2667 pxor @XMM[8+4], @XMM[4] 2668 lea 0x60($inp), $inp 2669 pxor @XMM[8+5], @XMM[5] 2670 lea 0x80(%rsp), %rax # pass key schedule 2671 mov %edx, %r10d # pass rounds 2672 2673 call _bsaes_decrypt8 2674 2675 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2676 pxor 0x10(%rsp), @XMM[1] 2677 movdqu @XMM[0], 0x00($out) # write output 2678 pxor 0x20(%rsp), @XMM[6] 2679 movdqu @XMM[1], 0x10($out) 2680 pxor 0x30(%rsp), @XMM[4] 2681 movdqu @XMM[6], 0x20($out) 2682 pxor 0x40(%rsp), @XMM[2] 2683 movdqu @XMM[4], 0x30($out) 2684 pxor 0x50(%rsp), @XMM[7] 2685 movdqu @XMM[2], 0x40($out) 2686 movdqu @XMM[7], 0x50($out) 2687 lea 0x60($out), $out 2688 2689 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2690 jmp .Lxts_dec_done 2691 .align 16 2692 .Lxts_dec_5: 2693 pxor @XMM[8+3], @XMM[3] 2694 lea 0x50($inp), $inp 2695 pxor @XMM[8+4], @XMM[4] 2696 lea 0x80(%rsp), %rax # pass key schedule 2697 mov %edx, %r10d # pass rounds 2698 2699 call _bsaes_decrypt8 2700 2701 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2702 pxor 0x10(%rsp), @XMM[1] 2703 movdqu @XMM[0], 0x00($out) # write output 2704 pxor 0x20(%rsp), @XMM[6] 2705 movdqu @XMM[1], 0x10($out) 2706 pxor 0x30(%rsp), @XMM[4] 2707 movdqu @XMM[6], 0x20($out) 2708 pxor 0x40(%rsp), @XMM[2] 2709 movdqu @XMM[4], 0x30($out) 2710 movdqu @XMM[2], 0x40($out) 2711 lea 0x50($out), $out 2712 2713 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2714 jmp .Lxts_dec_done 2715 .align 16 2716 .Lxts_dec_4: 2717 pxor @XMM[8+2], @XMM[2] 2718 lea 0x40($inp), $inp 2719 pxor @XMM[8+3], @XMM[3] 2720 lea 0x80(%rsp), %rax # pass key schedule 2721 mov %edx, %r10d # pass rounds 2722 2723 call _bsaes_decrypt8 2724 2725 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2726 pxor 0x10(%rsp), @XMM[1] 2727 movdqu @XMM[0], 0x00($out) # write output 2728 pxor 0x20(%rsp), @XMM[6] 2729 movdqu @XMM[1], 0x10($out) 2730 pxor 0x30(%rsp), @XMM[4] 2731 movdqu @XMM[6], 0x20($out) 2732 movdqu @XMM[4], 0x30($out) 2733 lea 0x40($out), $out 2734 2735 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2736 jmp .Lxts_dec_done 2737 .align 16 2738 .Lxts_dec_3: 2739 pxor @XMM[8+1], @XMM[1] 2740 lea 0x30($inp), $inp 2741 pxor @XMM[8+2], @XMM[2] 2742 lea 0x80(%rsp), %rax # pass key schedule 2743 mov %edx, %r10d # pass rounds 2744 2745 call _bsaes_decrypt8 2746 2747 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2748 pxor 0x10(%rsp), @XMM[1] 2749 movdqu @XMM[0], 0x00($out) # write output 2750 pxor 0x20(%rsp), @XMM[6] 2751 movdqu @XMM[1], 0x10($out) 2752 movdqu @XMM[6], 0x20($out) 2753 lea 0x30($out), $out 2754 2755 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2756 jmp .Lxts_dec_done 2757 .align 16 2758 .Lxts_dec_2: 2759 pxor @XMM[8+0], @XMM[0] 2760 lea 0x20($inp), $inp 2761 pxor @XMM[8+1], @XMM[1] 2762 lea 0x80(%rsp), %rax # pass key schedule 2763 mov %edx, %r10d # pass rounds 2764 2765 call _bsaes_decrypt8 2766 2767 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2768 pxor 0x10(%rsp), @XMM[1] 2769 movdqu @XMM[0], 0x00($out) # write output 2770 movdqu @XMM[1], 0x10($out) 2771 lea 0x20($out), $out 2772 2773 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2774 jmp .Lxts_dec_done 2775 .align 16 2776 .Lxts_dec_1: 2777 pxor @XMM[0], @XMM[8] 2778 lea 0x10($inp), $inp 2779 movdqa @XMM[8], 0x20(%rbp) 2780 lea 0x20(%rbp), $arg1 2781 lea 0x20(%rbp), $arg2 2782 lea ($key), $arg3 2783 call asm_AES_decrypt # doesn't touch %xmm 2784 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2785 #pxor @XMM[8], @XMM[0] 2786 #lea 0x80(%rsp), %rax # pass key schedule 2787 #mov %edx, %r10d # pass rounds 2788 #call _bsaes_decrypt8 2789 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2790 movdqu @XMM[0], 0x00($out) # write output 2791 lea 0x10($out), $out 2792 2793 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2794 2795 .Lxts_dec_done: 2796 and \$15, %ebx 2797 jz .Lxts_dec_ret 2798 2799 pxor $twtmp, $twtmp 2800 movdqa .Lxts_magic(%rip), $twmask 2801 pcmpgtd @XMM[7], $twtmp 2802 pshufd \$0x13, $twtmp, $twres 2803 movdqa @XMM[7], @XMM[6] 2804 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2805 pand $twmask, $twres # isolate carry and residue 2806 movdqu ($inp), @XMM[0] 2807 pxor $twres, @XMM[7] 2808 2809 lea 0x20(%rbp), $arg1 2810 pxor @XMM[7], @XMM[0] 2811 lea 0x20(%rbp), $arg2 2812 movdqa @XMM[0], 0x20(%rbp) 2813 lea ($key), $arg3 2814 call asm_AES_decrypt # doesn't touch %xmm 2815 pxor 0x20(%rbp), @XMM[7] 2816 mov $out, %rdx 2817 movdqu @XMM[7], ($out) 2818 2819 .Lxts_dec_steal: 2820 movzb 16($inp), %eax 2821 movzb (%rdx), %ecx 2822 lea 1($inp), $inp 2823 mov %al, (%rdx) 2824 mov %cl, 16(%rdx) 2825 lea 1(%rdx), %rdx 2826 sub \$1,%ebx 2827 jnz .Lxts_dec_steal 2828 2829 movdqu ($out), @XMM[0] 2830 lea 0x20(%rbp), $arg1 2831 pxor @XMM[6], @XMM[0] 2832 lea 0x20(%rbp), $arg2 2833 movdqa @XMM[0], 0x20(%rbp) 2834 lea ($key), $arg3 2835 call asm_AES_decrypt # doesn't touch %xmm 2836 pxor 0x20(%rbp), @XMM[6] 2837 movdqu @XMM[6], ($out) 2838 2839 .Lxts_dec_ret: 2840 lea (%rsp), %rax 2841 pxor %xmm0, %xmm0 2842 .Lxts_dec_bzero: # wipe key schedule [if any] 2843 movdqa %xmm0, 0x00(%rax) 2844 movdqa %xmm0, 0x10(%rax) 2845 lea 0x20(%rax), %rax 2846 cmp %rax, %rbp 2847 ja .Lxts_dec_bzero 2848 2849 lea (%rbp),%rsp # restore %rsp 2850 ___ 2851 $code.=<<___ if ($win64); 2852 movaps 0x40(%rbp), %xmm6 2853 movaps 0x50(%rbp), %xmm7 2854 movaps 0x60(%rbp), %xmm8 2855 movaps 0x70(%rbp), %xmm9 2856 movaps 0x80(%rbp), %xmm10 2857 movaps 0x90(%rbp), %xmm11 2858 movaps 0xa0(%rbp), %xmm12 2859 movaps 0xb0(%rbp), %xmm13 2860 movaps 0xc0(%rbp), %xmm14 2861 movaps 0xd0(%rbp), %xmm15 2862 lea 0xa0(%rbp), %rsp 2863 ___ 2864 $code.=<<___; 2865 mov 0x48(%rsp), %r15 2866 mov 0x50(%rsp), %r14 2867 mov 0x58(%rsp), %r13 2868 mov 0x60(%rsp), %r12 2869 mov 0x68(%rsp), %rbx 2870 mov 0x70(%rsp), %rax 2871 lea 0x78(%rsp), %rsp 2872 mov %rax, %rbp 2873 .Lxts_dec_epilogue: 2874 ret 2875 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2876 ___ 2877 } 2878 $code.=<<___; 2879 .type _bsaes_const,\@object 2880 .align 64 2881 _bsaes_const: 2882 .LM0ISR: # InvShiftRows constants 2883 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2884 .LISRM0: 2885 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2886 .LISR: 2887 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 2888 .LBS0: # bit-slice constants 2889 .quad 0x5555555555555555, 0x5555555555555555 2890 .LBS1: 2891 .quad 0x3333333333333333, 0x3333333333333333 2892 .LBS2: 2893 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2894 .LSR: # shiftrows constants 2895 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2896 .LSRM0: 2897 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2898 .LM0SR: 2899 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2900 .LSWPUP: # byte-swap upper dword 2901 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2902 .LSWPUPM0SR: 2903 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2904 .LADD1: # counter increment constants 2905 .quad 0x0000000000000000, 0x0000000100000000 2906 .LADD2: 2907 .quad 0x0000000000000000, 0x0000000200000000 2908 .LADD3: 2909 .quad 0x0000000000000000, 0x0000000300000000 2910 .LADD4: 2911 .quad 0x0000000000000000, 0x0000000400000000 2912 .LADD5: 2913 .quad 0x0000000000000000, 0x0000000500000000 2914 .LADD6: 2915 .quad 0x0000000000000000, 0x0000000600000000 2916 .LADD7: 2917 .quad 0x0000000000000000, 0x0000000700000000 2918 .LADD8: 2919 .quad 0x0000000000000000, 0x0000000800000000 2920 .Lxts_magic: 2921 .long 0x87,0,1,0 2922 .Lmasks: 2923 .quad 0x0101010101010101, 0x0101010101010101 2924 .quad 0x0202020202020202, 0x0202020202020202 2925 .quad 0x0404040404040404, 0x0404040404040404 2926 .quad 0x0808080808080808, 0x0808080808080808 2927 .LM0: 2928 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2929 .L63: 2930 .quad 0x6363636363636363, 0x6363636363636363 2931 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Ksper, Peter Schwabe, Andy Polyakov" 2932 .align 64 2933 .size _bsaes_const,.-_bsaes_const 2934 ___ 2935 2936 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2937 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 2938 if ($win64) { 2939 $rec="%rcx"; 2940 $frame="%rdx"; 2941 $context="%r8"; 2942 $disp="%r9"; 2943 2944 $code.=<<___; 2945 .extern __imp_RtlVirtualUnwind 2946 .type se_handler,\@abi-omnipotent 2947 .align 16 2948 se_handler: 2949 push %rsi 2950 push %rdi 2951 push %rbx 2952 push %rbp 2953 push %r12 2954 push %r13 2955 push %r14 2956 push %r15 2957 pushfq 2958 sub \$64,%rsp 2959 2960 mov 120($context),%rax # pull context->Rax 2961 mov 248($context),%rbx # pull context->Rip 2962 2963 mov 8($disp),%rsi # disp->ImageBase 2964 mov 56($disp),%r11 # disp->HandlerData 2965 2966 mov 0(%r11),%r10d # HandlerData[0] 2967 lea (%rsi,%r10),%r10 # prologue label 2968 cmp %r10,%rbx # context->Rip<prologue label 2969 jb .Lin_prologue 2970 2971 mov 152($context),%rax # pull context->Rsp 2972 2973 mov 4(%r11),%r10d # HandlerData[1] 2974 lea (%rsi,%r10),%r10 # epilogue label 2975 cmp %r10,%rbx # context->Rip>=epilogue label 2976 jae .Lin_prologue 2977 2978 mov 160($context),%rax # pull context->Rbp 2979 2980 lea 0x40(%rax),%rsi # %xmm save area 2981 lea 512($context),%rdi # &context.Xmm6 2982 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2983 .long 0xa548f3fc # cld; rep movsq 2984 lea 0xa0(%rax),%rax # adjust stack pointer 2985 2986 mov 0x70(%rax),%rbp 2987 mov 0x68(%rax),%rbx 2988 mov 0x60(%rax),%r12 2989 mov 0x58(%rax),%r13 2990 mov 0x50(%rax),%r14 2991 mov 0x48(%rax),%r15 2992 lea 0x78(%rax),%rax # adjust stack pointer 2993 mov %rbx,144($context) # restore context->Rbx 2994 mov %rbp,160($context) # restore context->Rbp 2995 mov %r12,216($context) # restore context->R12 2996 mov %r13,224($context) # restore context->R13 2997 mov %r14,232($context) # restore context->R14 2998 mov %r15,240($context) # restore context->R15 2999 3000 .Lin_prologue: 3001 mov %rax,152($context) # restore context->Rsp 3002 3003 mov 40($disp),%rdi # disp->ContextRecord 3004 mov $context,%rsi # context 3005 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3006 .long 0xa548f3fc # cld; rep movsq 3007 3008 mov $disp,%rsi 3009 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3010 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3011 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3012 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3013 mov 40(%rsi),%r10 # disp->ContextRecord 3014 lea 56(%rsi),%r11 # &disp->HandlerData 3015 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3016 mov %r10,32(%rsp) # arg5 3017 mov %r11,40(%rsp) # arg6 3018 mov %r12,48(%rsp) # arg7 3019 mov %rcx,56(%rsp) # arg8, (NULL) 3020 call *__imp_RtlVirtualUnwind(%rip) 3021 3022 mov \$1,%eax # ExceptionContinueSearch 3023 add \$64,%rsp 3024 popfq 3025 pop %r15 3026 pop %r14 3027 pop %r13 3028 pop %r12 3029 pop %rbp 3030 pop %rbx 3031 pop %rdi 3032 pop %rsi 3033 ret 3034 .size se_handler,.-se_handler 3035 3036 .section .pdata 3037 .align 4 3038 ___ 3039 $code.=<<___ if ($ecb); 3040 .rva .Lecb_enc_prologue 3041 .rva .Lecb_enc_epilogue 3042 .rva .Lecb_enc_info 3043 3044 .rva .Lecb_dec_prologue 3045 .rva .Lecb_dec_epilogue 3046 .rva .Lecb_dec_info 3047 ___ 3048 $code.=<<___; 3049 .rva .Lcbc_dec_prologue 3050 .rva .Lcbc_dec_epilogue 3051 .rva .Lcbc_dec_info 3052 3053 .rva .Lctr_enc_prologue 3054 .rva .Lctr_enc_epilogue 3055 .rva .Lctr_enc_info 3056 3057 .rva .Lxts_enc_prologue 3058 .rva .Lxts_enc_epilogue 3059 .rva .Lxts_enc_info 3060 3061 .rva .Lxts_dec_prologue 3062 .rva .Lxts_dec_epilogue 3063 .rva .Lxts_dec_info 3064 3065 .section .xdata 3066 .align 8 3067 ___ 3068 $code.=<<___ if ($ecb); 3069 .Lecb_enc_info: 3070 .byte 9,0,0,0 3071 .rva se_handler 3072 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3073 .Lecb_dec_info: 3074 .byte 9,0,0,0 3075 .rva se_handler 3076 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3077 ___ 3078 $code.=<<___; 3079 .Lcbc_dec_info: 3080 .byte 9,0,0,0 3081 .rva se_handler 3082 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3083 .Lctr_enc_info: 3084 .byte 9,0,0,0 3085 .rva se_handler 3086 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3087 .Lxts_enc_info: 3088 .byte 9,0,0,0 3089 .rva se_handler 3090 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3091 .Lxts_dec_info: 3092 .byte 9,0,0,0 3093 .rva se_handler 3094 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3095 ___ 3096 } 3097 3098 $code =~ s/\`([^\`]*)\`/eval($1)/gem; 3099 3100 print $code; 3101 3102 close STDOUT; 3103