#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57 parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and are still the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__)
.arch  armv8-a+crypto
#endif
___
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit codes within a single module and
# transliterate common code to either flavour with regex voodoo.
#
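# By way of illustration (this note is explanatory only, assuming $key
# is x3 as in the CBC code below): a shared line such as
#	vld1.32	{q8-q9},[$key]
# is rewritten by the loops at the bottom of this file into
#	ld1	{v16.4s-v17.4s},[x3]
# for the 64-bit flavour (q8-q15 map to v16-v23) and into
#	vld1.32	{q8-q9},[r3]
# for the 32-bit one.
#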
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));

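# A note on the expansion loops below: vtbl with the "rotate-n-splat"
# mask from .Lrcon replicates RotWord() of the last key word into all
# four lanes, and aese against $zero (an all-zero round key) then
# applies SubBytes; ShiftRows is a no-op on a vector whose four
# columns are equal, so the pair computes the SubWord(RotWord()) term
# of the AES key schedule in every lane.
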
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
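    # Encoding sketch (for reference): Vd lives in bits [4:0] and Vn in
    # bits [9:5], so "aese v0.16b,v1.16b" would come out as
    # ".inst 0x4e284820" (0x4e284800|0|(1<<5)). The unaes substitution
    # is left commented out at its use site below.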

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte-by-byte emission below. The correct solution would
	    # be to use the .inst directive, but older assemblers don't
	    # implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
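    # For example, "aese q0,q1" yields the word 0xf3b00302 from the
    # field placement above, emitted little-endian as
    # ".byte 0x02,0x03,0xb0,0xf3".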

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
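    # E.g. the key-setup line "vtbl.8	q10,{q3},q2" is split into
    # "vtbl.8	d20,{q3},d4" and "vtbl.8	d21,{q3},d5", since the
    # 32-bit vtbl only writes a d register.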

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
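    # E.g. "vdup.32	q9,q3[3]" becomes "vdup.32	q9,d7[1]": lane 3
    # of q3 is lane 1 of its upper half d7.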

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
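    # E.g. "vmov.32	q1[3],r9" becomes "vmov.32	d3[1],r9" by the
    # same q-to-d lane mapping.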

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;