#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional instructions. This has
# no effect on mighty Apple A7, as results are literally equal to
# the theoretical estimates based on instruction latencies and issue
# rate. It remains to be seen how does it affect other platforms...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A5x	n/a		n/a		n/a

# Usage: aesv8-armx.pl <flavour> <output-file>
# $flavour selects 32- vs 64-bit transliteration of the common code below.
$flavour = shift;
$output  = shift;
open STDOUT,">$output" or die "can't open $output: $!";

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]

	eor	x0,x0,x0		// return value
	`"ldr	x29,[sp],#16"	if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block en-/decrypt routine; $dir is "en" or "de" and
# selects the aese/aesmc vs. aesd/aesimc instruction pair.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_dec128:
	vld1.32	{$tmp0-$tmp1},[$key_]
	veor	$ivec,$ivec,$rndlast
	veor	$in0,$dat0,$rndlast
	mov	$step1,$step

.Loop2x_cbc_dec128:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	subs	$len,$len,#32
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	cclr	$step,lo
	aesd	$dat0,$tmp0
	aesd	$dat1,$tmp0
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	cclr	$step1,ls
	aesd	$dat0,$tmp1
	aesd	$dat1,$tmp1
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q15
	aesd	$dat1,q15

	veor	$ivec,$ivec,$dat0
	vld1.8	{$dat0},[$inp],$step
	veor	$in0,$in0,$dat1
	vld1.8	{$dat1},[$inp],$step1
	vst1.8	{$ivec},[$out],#16
	veor	$ivec,$in1,$rndlast
	vst1.8	{$in0},[$out],#16
	veor	$in0,$dat0,$rndlast
	vorr	$in1,$dat1,$dat1
	b.hs	.Loop2x_cbc_dec128

	adds	$len,$len,#32
	veor	$ivec,$ivec,$rndlast
	b.eq	.Lcbc_done
	veor	$in0,$in0,$rndlast
	b	.Lcbc_dec_tail

.align	5
.Lcbc_dec:
	subs	$len,$len,#16
	vorr	$in0,$dat,$dat
	b.lo	.Lcbc_dec_tail

	cclr	$step,eq
	cmp	$rounds,#2
	vld1.8	{$dat1},[$inp],$step
	vorr	$in1,$dat1,$dat1
	b.eq	.Lcbc_dec128

.Loop2x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	b.gt	.Loop2x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	veor	$tmp0,$ivec,$rndlast
	veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	vorr	$ivec,$in1,$in1
	subs	$len,$len,#32
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	cclr	$step,lo
	aesimc	$dat1,$dat1
	mov	$key_,$key
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	vld1.8	{$in0},[$inp],$step
	aesimc	$dat1,$dat1
	cclr	$step,ls
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	vld1.8	{$in1},[$inp],$step
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aesd	$dat0,q15
	aesd	$dat1,q15

	mov	$cnt,$rounds
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	b.hs	.Loop2x_cbc_dec

	adds	$len,$len,#32
	b.eq	.Lcbc_done

.Lcbc_dec_tail:
	aesd	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat,$dat
	subs	$cnt,$cnt,#2
	aesd	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat,$dat
	b.gt	.Lcbc_dec_tail

	aesd	$dat,q8
	aesimc	$dat,$dat
	aesd	$dat,q9
	aesimc	$dat,$dat
	veor	$tmp,$ivec,$rndlast
	aesd	$dat,q10
	aesimc	$dat,$dat
	vorr	$ivec,$in0,$in0
	aesd	$dat,q11
	aesimc	$dat,$dat
	aesd	$dat,q12
	aesimc	$dat,$dat
	aesd	$dat,q13
	aesimc	$dat,$dat
	aesd	$dat,q14
	aesimc	$dat,$dat
	aesd	$dat,q15

	veor	$tmp,$tmp,$dat
	vst1.8	{$tmp},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds

	subs	$len,$len,#2
	b.lo	.Lctr32_tail

#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$ctr, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $ctr
	cmp	$rounds,#2
	vmov.32	${dat1}[3],$tctr1
	b.eq	.Lctr32_128

.Loop2x_ctr32:
	aese	$dat0,q8
	aese	$dat1,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aese	$dat1,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	b.gt	.Loop2x_ctr32

	aese	$dat0,q8
	aese	$dat1,q8
	aesmc	$tmp0,$dat0
	vorr	$dat0,$ivec,$ivec
	aesmc	$tmp1,$dat1
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aese	$tmp1,q9
	vld1.8	{$in0},[$inp],#16
	aesmc	$tmp0,$tmp0
	vld1.8	{$in1},[$inp],#16
	aesmc	$tmp1,$tmp1
	add	$ctr,$ctr,#1
	aese	$tmp0,q10
	aese	$tmp1,q10
	rev	$tctr,$ctr
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	add	$ctr,$ctr,#1
	aese	$tmp0,q11
	aese	$tmp1,q11
	veor	$in0,$in0,$rndlast
	rev	$tctr1,$ctr
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	veor	$in1,$in1,$rndlast
	mov	$key_,$key
	aese	$tmp0,q12
	aese	$tmp1,q12
	subs	$len,$len,#2
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	vld1.32	{q8-q9},[$key_],#32	// re-pre-load rndkey[0-1]
	aese	$tmp0,q13
	aese	$tmp1,q13
	aesmc	$tmp0,$tmp0
	aesmc	$tmp1,$tmp1
	aese	$tmp0,q14
	aese	$tmp1,q14
	vmov.32	${dat0}[3], $tctr
	aesmc	$tmp0,$tmp0
	vmov.32	${dat1}[3], $tctr1
	aesmc	$tmp1,$tmp1
	aese	$tmp0,q15
	aese	$tmp1,q15

	mov	$cnt,$rounds
	veor	$in0,$in0,$tmp0
	veor	$in1,$in1,$tmp1
	vst1.8	{$in0},[$out],#16
	vst1.8	{$in1},[$out],#16
	b.hs	.Loop2x_ctr32

	adds	$len,$len,#2
	b.eq	.Lctr32_done
	b	.Lctr32_tail

.Lctr32_128:
	vld1.32	{$tmp0-$tmp1},[$key_]

.Loop2x_ctr32_128:
	aese	$dat0,q8
	aese	$dat1,q8
	aesmc	$dat0,$dat0
	vld1.8	{$in0},[$inp],#16
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q9
	aese	$dat1,q9
	add	$ctr,$ctr,#1
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	rev	$tctr,$ctr
	aese	$dat0,$tmp0
	aese	$dat1,$tmp0
	add	$ctr,$ctr,#1
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	rev	$tctr1,$ctr
	aese	$dat0,$tmp1
	aese	$dat1,$tmp1
	subs	$len,$len,#2
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q10
	aese	$dat1,q10
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q11
	aese	$dat1,q11
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q12
	aese	$dat1,q12
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q13
	aese	$dat1,q13
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	aese	$dat0,q14
	aese	$dat1,q14
	aesmc	$dat0,$dat0
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q15
	veor	$in1,$in1,$rndlast
	aese	$dat1,q15

	veor	$in0,$in0,$dat0
	vorr	$dat0,$ivec,$ivec
	veor	$in1,$in1,$dat1
	vorr	$dat1,$ivec,$ivec
	vst1.8	{$in0},[$out],#16
	vmov.32	${dat0}[3], $tctr
	vst1.8	{$in1},[$out],#16
	vmov.32	${dat1}[3], $tctr1
	b.hs	.Loop2x_ctr32_128

	adds	$len,$len,#2
	b.eq	.Lctr32_done

.Lctr32_tail:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Lctr32_tail

	aese	$dat,q8
	aesmc	$dat,$dat
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.8	{$in0},[$inp]
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	$in0,$in0,$rndlast
	aese	$dat,q15

	veor	$in0,$in0,$dat
	vst1.8	{$in0},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: transliterate the flavour-neutral code above to
# either 64-bit (AArch64) or 32-bit (AArch32/NEON) assembler syntax.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word for assemblers
    # that lack the crypto extension mnemonics.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode an AES instruction as raw bytes, because ARMv7
    # instructions are always encoded little-endian.
    # correct solution is to use .inst directive, but older
    # assemblers don't implement it:-(
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a q-register vtbl into the pair of d-register vtbls
    # that AArch32 NEON actually provides.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;	
    }

    # Map a 64-bit q-register lane dup to the 32-bit d-register form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	
    }

    # Map a 64-bit q-register lane move to the 32-bit d-register form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;	
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b\./o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush