1 #!/usr/bin/env perl 2 # 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 # 10 # This module implements support for ARMv8 AES instructions. The 11 # module is endian-agnostic in sense that it supports both big- and 12 # little-endian cases. As does it support both 32- and 64-bit modes 13 # of operation. Latter is achieved by limiting amount of utilized 14 # registers to 16, which implies additional NEON load and integer 15 # instructions. This has no effect on mighty Apple A7, where results 16 # are literally equal to the theoretical estimates based on AES 17 # instruction latencies and issue rates. On Cortex-A53, an in-order 18 # execution core, this costs up to 10-15%, which is partially 19 # compensated by implementing dedicated code path for 128-bit 20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance 21 # seems to be limited by sheer amount of NEON instructions... 
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc	CBC dec	CTR
# Apple A7	2.39	1.20	1.20
# Cortex-A53	1.32	1.29	1.46
# Cortex-A57(*)	1.95	0.85	0.93
# Denver	1.96	0.86	0.80
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;

# Command-line: flavour selects 32- vs 64-bit output (see the
# $flavour =~ /64/ tests below); output is the destination file,
# passed on to the arm-xlate.pl post-processor.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl transliteration helper next to this script or
# in the shared perlasm directory; all generated code is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# All exported symbols are named ${prefix}_<operation>.
$prefix="aes_v8";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif
___
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Section 1: key schedule setup.
# ${prefix}_set_encrypt_key(inp, bits, out) expands a 128/192/256-bit
# user key into the round-key schedule at `out`; return value in x0 is
# 0 on success, -1 for NULL inp/out, -2 for an unsupported bit length
# (see the mov $ptr,#-1 / #-2 / #0 paths below).
# ${prefix}_set_decrypt_key runs the same expansion via .Lenc_key, then
# reverses the schedule in place, applying aesimc to the inner round keys.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Section 2: single-block ECB-style primitives.
# gen_block("en") emits ${prefix}_encrypt and gen_block("de") emits
# ${prefix}_decrypt; the two differ only in the mnemonic pair used
# (aese/aesmc vs aesd/aesimc), selected via $e/$mc below.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# Section 3: ${prefix}_cbc_encrypt(inp, out, len, key, ivp, enc).
# Encrypt side processes one block per iteration (with a dedicated
# .Lcbc_enc128 path for 10-round/128-bit keys); decrypt side runs
# three blocks in flight (.Loop3x_cbc_dec) with a 1-2 block tail.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# Decrypt path: three extra vectors for the 3x-interleaved loop.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# Section 4: ${prefix}_ctr32_encrypt_blocks(inp, out, len, key, ivp).
# CTR mode with a 32-bit big-endian counter in the last word of the IV;
# main loop (.Loop3x_ctr32) keeps three blocks in flight, with a
# 1-2 block tail at .Lctr32_tail.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Transliteration passes: $code above is written in a mixed dialect;
# one of the two branches below rewrites it line-by-line into pure
# 64-bit (AArch64) or pure 32-bit (ARMv7 NEON) assembler before printing.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word (kept for reference;
    # the corresponding substitution below is commented out).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as raw .byte data (little-endian word).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a 128-bit vtbl into the two 64-bit vtbl.8 forms ARMv7 has.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map q-register lane splat to the d-register form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map q-register lane insert to the d-register form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;