      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
      10 # This module implements support for ARMv8 AES instructions. The
      11 # module is endian-agnostic in the sense that it supports both big-
      12 # and little-endian cases, and it supports both 32- and 64-bit modes
      13 # of operation. The latter is achieved by limiting the number of
      14 # registers used to 16, which implies additional instructions. This
      15 # has no effect on the mighty Apple A7, as results are literally
      16 # equal to the theoretical estimates based on instruction latencies
      17 # and issue rate. It remains to be seen how it affects other platforms...
     18 #
     19 # Performance in cycles per byte processed with 128-bit key:
     20 #
     21 #		CBC enc		CBC dec		CTR
     22 # Apple A7	2.39		1.20		1.20
     23 # Cortex-A5x	n/a		n/a		n/a
     24 
     25 $flavour = shift;
     26 open STDOUT,">".shift;
     27 
     28 $prefix="aes_v8";
     29 
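# The routines emitted below are drop-in AES primitives. The C prototypes
# are illustrative only (names and parameter spellings are assumptions),
# assuming the conventional AES_KEY layout with the round count stored at
# byte offset 240, which is where the code below reads and writes it:
#
#	int  aes_v8_set_encrypt_key(const unsigned char *user_key,
#				    const int bits, AES_KEY *key);
#	int  aes_v8_set_decrypt_key(const unsigned char *user_key,
#				    const int bits, AES_KEY *key);
#	void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#	void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#	void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#				size_t length, const AES_KEY *key,
#				unsigned char *ivec, const int enc);
#	void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#				unsigned char *out, size_t blocks,
#				const AES_KEY *key,
#				const unsigned char ivec[16]);
#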
     30 $code=<<___;
     31 #include "arm_arch.h"
     32 
     33 #if __ARM_ARCH__>=7
     34 .text
     35 ___
     36 $code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
     37 $code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
     38 
      39 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
      40 # NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
      41 # The goal is to maintain both 32- and 64-bit code within a single
      42 # module and transliterate common code to either flavour with regex voodoo.
     43 #
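# For example (see the post-processing passes at the end of this file):
# in the 64-bit flavour "vext.8" is rewritten as "ext" and "vmov.i8" as
# "movi", and the synthetic "cclr <reg>,<cond>" becomes
# "csel <reg>,<w/x>zr,<reg>,<cond>"; in the 32-bit flavour "cclr" turns
# into a conditional "mov<cond> <reg>,#0" and aese/aesd/aesmc/aesimc are
# emitted as raw ".byte" sequences for assemblers lacking the mnemonics.
#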
     44 {{{
     45 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
     46 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
     47 	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
     48 
     49 
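# A note on the key expansion trick used below (128-bit case shown): the
# "rotate-n-splat" mask drives vtbl so that the last 32-bit word of the
# previous round key is rotated by one byte and broadcast to all four
# lanes, and aese against the all-zero $zero register then applies
# SubBytes (ShiftRows is a no-op when all four columns are equal), which
# together implement RotWord+SubWord. The round constant in $rcon is
# XORed in and doubled with vshl.u8 each iteration, and reloaded as 0x1b
# from the table once plain doubling would overflow.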
     50 $code.=<<___;
     51 .align	5
     52 rcon:
     53 .long	0x01,0x01,0x01,0x01
     54 .long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
     55 .long	0x1b,0x1b,0x1b,0x1b
     56 
     57 .globl	${prefix}_set_encrypt_key
     58 .type	${prefix}_set_encrypt_key,%function
     59 .align	5
     60 ${prefix}_set_encrypt_key:
     61 .Lenc_key:
     62 ___
     63 $code.=<<___	if ($flavour =~ /64/);
     64 	stp	x29,x30,[sp,#-16]!
     65 	add	x29,sp,#0
     66 ___
     67 $code.=<<___;
     68 	adr	$ptr,rcon
     69 	cmp	$bits,#192
     70 
     71 	veor	$zero,$zero,$zero
     72 	vld1.8	{$in0},[$inp],#16
     73 	mov	$bits,#8		// reuse $bits
     74 	vld1.32	{$rcon,$mask},[$ptr],#32
     75 
     76 	b.lt	.Loop128
     77 	b.eq	.L192
     78 	b	.L256
     79 
     80 .align	4
     81 .Loop128:
     82 	vtbl.8	$key,{$in0},$mask
     83 	vext.8	$tmp,$zero,$in0,#12
     84 	vst1.32	{$in0},[$out],#16
     85 	aese	$key,$zero
     86 	subs	$bits,$bits,#1
     87 
     88 	veor	$in0,$in0,$tmp
     89 	vext.8	$tmp,$zero,$tmp,#12
     90 	veor	$in0,$in0,$tmp
     91 	vext.8	$tmp,$zero,$tmp,#12
     92 	 veor	$key,$key,$rcon
     93 	veor	$in0,$in0,$tmp
     94 	vshl.u8	$rcon,$rcon,#1
     95 	veor	$in0,$in0,$key
     96 	b.ne	.Loop128
     97 
     98 	vld1.32	{$rcon},[$ptr]
     99 
    100 	vtbl.8	$key,{$in0},$mask
    101 	vext.8	$tmp,$zero,$in0,#12
    102 	vst1.32	{$in0},[$out],#16
    103 	aese	$key,$zero
    104 
    105 	veor	$in0,$in0,$tmp
    106 	vext.8	$tmp,$zero,$tmp,#12
    107 	veor	$in0,$in0,$tmp
    108 	vext.8	$tmp,$zero,$tmp,#12
    109 	 veor	$key,$key,$rcon
    110 	veor	$in0,$in0,$tmp
    111 	vshl.u8	$rcon,$rcon,#1
    112 	veor	$in0,$in0,$key
    113 
    114 	vtbl.8	$key,{$in0},$mask
    115 	vext.8	$tmp,$zero,$in0,#12
    116 	vst1.32	{$in0},[$out],#16
    117 	aese	$key,$zero
    118 
    119 	veor	$in0,$in0,$tmp
    120 	vext.8	$tmp,$zero,$tmp,#12
    121 	veor	$in0,$in0,$tmp
    122 	vext.8	$tmp,$zero,$tmp,#12
    123 	 veor	$key,$key,$rcon
    124 	veor	$in0,$in0,$tmp
    125 	veor	$in0,$in0,$key
    126 	vst1.32	{$in0},[$out]
    127 	add	$out,$out,#0x50
    128 
    129 	mov	$rounds,#10
    130 	b	.Ldone
    131 
    132 .align	4
    133 .L192:
    134 	vld1.8	{$in1},[$inp],#8
    135 	vmov.i8	$key,#8			// borrow $key
    136 	vst1.32	{$in0},[$out],#16
    137 	vsub.i8	$mask,$mask,$key	// adjust the mask
    138 
    139 .Loop192:
    140 	vtbl.8	$key,{$in1},$mask
    141 	vext.8	$tmp,$zero,$in0,#12
    142 	vst1.32	{$in1},[$out],#8
    143 	aese	$key,$zero
    144 	subs	$bits,$bits,#1
    145 
    146 	veor	$in0,$in0,$tmp
    147 	vext.8	$tmp,$zero,$tmp,#12
    148 	veor	$in0,$in0,$tmp
    149 	vext.8	$tmp,$zero,$tmp,#12
    150 	veor	$in0,$in0,$tmp
    151 
    152 	vdup.32	$tmp,${in0}[3]
    153 	veor	$tmp,$tmp,$in1
    154 	 veor	$key,$key,$rcon
    155 	vext.8	$in1,$zero,$in1,#12
    156 	vshl.u8	$rcon,$rcon,#1
    157 	veor	$in1,$in1,$tmp
    158 	veor	$in0,$in0,$key
    159 	veor	$in1,$in1,$key
    160 	vst1.32	{$in0},[$out],#16
    161 	b.ne	.Loop192
    162 
    163 	mov	$rounds,#12
    164 	add	$out,$out,#0x20
    165 	b	.Ldone
    166 
    167 .align	4
    168 .L256:
    169 	vld1.8	{$in1},[$inp]
    170 	mov	$bits,#7
    171 	mov	$rounds,#14
    172 	vst1.32	{$in0},[$out],#16
    173 
    174 .Loop256:
    175 	vtbl.8	$key,{$in1},$mask
    176 	vext.8	$tmp,$zero,$in0,#12
    177 	vst1.32	{$in1},[$out],#16
    178 	aese	$key,$zero
    179 	subs	$bits,$bits,#1
    180 
    181 	veor	$in0,$in0,$tmp
    182 	vext.8	$tmp,$zero,$tmp,#12
    183 	veor	$in0,$in0,$tmp
    184 	vext.8	$tmp,$zero,$tmp,#12
    185 	 veor	$key,$key,$rcon
    186 	veor	$in0,$in0,$tmp
    187 	vshl.u8	$rcon,$rcon,#1
    188 	veor	$in0,$in0,$key
    189 	vst1.32	{$in0},[$out],#16
    190 	b.eq	.Ldone
    191 
    192 	vdup.32	$key,${in0}[3]		// just splat
    193 	vext.8	$tmp,$zero,$in1,#12
    194 	aese	$key,$zero
    195 
    196 	veor	$in1,$in1,$tmp
    197 	vext.8	$tmp,$zero,$tmp,#12
    198 	veor	$in1,$in1,$tmp
    199 	vext.8	$tmp,$zero,$tmp,#12
    200 	veor	$in1,$in1,$tmp
    201 
    202 	veor	$in1,$in1,$key
    203 	b	.Loop256
    204 
    205 .Ldone:
    206 	str	$rounds,[$out]
    207 
    208 	eor	x0,x0,x0		// return value
    209 	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
    210 	ret
    211 .size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
    212 
    213 .globl	${prefix}_set_decrypt_key
    214 .type	${prefix}_set_decrypt_key,%function
    215 .align	5
    216 ${prefix}_set_decrypt_key:
    217 ___
    218 $code.=<<___	if ($flavour =~ /64/);
    219 	stp	x29,x30,[sp,#-16]!
    220 	add	x29,sp,#0
    221 ___
    222 $code.=<<___	if ($flavour !~ /64/);
    223 	stmdb	sp!,{r4,lr}
    224 ___
    225 $code.=<<___;
    226 	bl	.Lenc_key
    227 
    228 	sub	$out,$out,#240		// restore original $out
    229 	mov	x4,#-16
    230 	add	$inp,$out,x12,lsl#4	// end of key schedule
    231 
    232 	vld1.32	{v0.16b},[$out]
    233 	vld1.32	{v1.16b},[$inp]
    234 	vst1.32	{v0.16b},[$inp],x4
    235 	vst1.32	{v1.16b},[$out],#16
    236 
    237 .Loop_imc:
    238 	vld1.32	{v0.16b},[$out]
    239 	vld1.32	{v1.16b},[$inp]
    240 	aesimc	v0.16b,v0.16b
    241 	aesimc	v1.16b,v1.16b
    242 	vst1.32	{v0.16b},[$inp],x4
    243 	vst1.32	{v1.16b},[$out],#16
    244 	cmp	$inp,$out
    245 	b.hi	.Loop_imc
    246 
    247 	vld1.32	{v0.16b},[$out]
    248 	aesimc	v0.16b,v0.16b
    249 	vst1.32	{v0.16b},[$inp]
    250 
    251 	eor	x0,x0,x0		// return value
    252 ___
    253 $code.=<<___	if ($flavour !~ /64/);
    254 	ldmia	sp!,{r4,pc}
    255 ___
    256 $code.=<<___	if ($flavour =~ /64/);
    257 	ldp	x29,x30,[sp],#16
    258 	ret
    259 ___
    260 $code.=<<___;
    261 .size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
    262 ___
    263 }}}
    264 {{{
    265 sub gen_block () {
    266 my $dir = shift;
    267 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
    268 my ($inp,$out,$key)=map("x$_",(0..2));
    269 my $rounds="w3";
    270 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
    271 
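# Note: $rounds is pre-decremented by 2, so the loop below runs two AES
# rounds per iteration with the next two round keys loaded in between,
# while the final two rounds (the last one without MixColumns) and the
# closing veor with the last round key are handled after the loop.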
    272 $code.=<<___;
    273 .globl	${prefix}_${dir}crypt
    274 .type	${prefix}_${dir}crypt,%function
    275 .align	5
    276 ${prefix}_${dir}crypt:
    277 	ldr	$rounds,[$key,#240]
    278 	vld1.32	{$rndkey0},[$key],#16
    279 	vld1.8	{$inout},[$inp]
    280 	sub	$rounds,$rounds,#2
    281 	vld1.32	{$rndkey1},[$key],#16
    282 
    283 .Loop_${dir}c:
    284 	aes$e	$inout,$rndkey0
    285 	vld1.32	{$rndkey0},[$key],#16
    286 	aes$mc	$inout,$inout
    287 	subs	$rounds,$rounds,#2
    288 	aes$e	$inout,$rndkey1
    289 	vld1.32	{$rndkey1},[$key],#16
    290 	aes$mc	$inout,$inout
    291 	b.gt	.Loop_${dir}c
    292 
    293 	aes$e	$inout,$rndkey0
    294 	vld1.32	{$rndkey0},[$key]
    295 	aes$mc	$inout,$inout
    296 	aes$e	$inout,$rndkey1
    297 	veor	$inout,$inout,$rndkey0
    298 
    299 	vst1.8	{$inout},[$out]
    300 	ret
    301 .size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
    302 ___
    303 }
    304 &gen_block("en");
    305 &gen_block("de");
    306 }}}
    307 {{{
    308 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
    309 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
    310 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
    311 
    312 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
    313 
    314 ### q8-q15	preloaded key schedule
    315 
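# The last 7 round keys stay resident in q10-q15 and $rndlast for the
# whole call, while $rounds is reduced to rounds-8 so the inner loops
# below walk the remaining leading round keys through q8/q9 two at a
# time. On the encrypt path $rndzero_n_last (= rndkey[0]^rndkey[last])
# is XORed into each freshly loaded plaintext block while the previous
# block is still in flight; feeding the pre-output state $dat into the
# next block's first aese then yields C_prev^P^rndkey[0], keeping the
# CBC chaining XOR off the critical path.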
    316 $code.=<<___;
    317 .globl	${prefix}_cbc_encrypt
    318 .type	${prefix}_cbc_encrypt,%function
    319 .align	5
    320 ${prefix}_cbc_encrypt:
    321 ___
    322 $code.=<<___	if ($flavour =~ /64/);
    323 	stp	x29,x30,[sp,#-16]!
    324 	add	x29,sp,#0
    325 ___
    326 $code.=<<___	if ($flavour !~ /64/);
    327 	mov	ip,sp
    328 	stmdb	sp!,{r4-r8,lr}
    329 	vstmdb	sp!,{d8-d15}            @ ABI specification says so
    330 	ldmia	ip,{r4-r5}		@ load remaining args
    331 ___
    332 $code.=<<___;
    333 	subs	$len,$len,#16
    334 	mov	$step,#16
    335 	b.lo	.Lcbc_abort
    336 	cclr	$step,eq
    337 
    338 	cmp	$enc,#0			// en- or decrypting?
    339 	ldr	$rounds,[$key,#240]
    340 	and	$len,$len,#-16
    341 	vld1.8	{$ivec},[$ivp]
    342 	vld1.8	{$dat},[$inp],$step
    343 
    344 	vld1.32	{q8-q9},[$key]		// load key schedule...
    345 	sub	$rounds,$rounds,#6
    346 	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
    347 	sub	$rounds,$rounds,#2
    348 	vld1.32	{q10-q11},[$key_],#32
    349 	vld1.32	{q12-q13},[$key_],#32
    350 	vld1.32	{q14-q15},[$key_],#32
    351 	vld1.32	{$rndlast},[$key_]
    352 
    353 	add	$key_,$key,#32
    354 	mov	$cnt,$rounds
    355 	b.eq	.Lcbc_dec
    356 
    357 	cmp	$rounds,#2
    358 	veor	$dat,$dat,$ivec
    359 	veor	$rndzero_n_last,q8,$rndlast
    360 	b.eq	.Lcbc_enc128
    361 
    362 .Loop_cbc_enc:
    363 	aese	$dat,q8
    364 	vld1.32	{q8},[$key_],#16
    365 	aesmc	$dat,$dat
    366 	subs	$cnt,$cnt,#2
    367 	aese	$dat,q9
    368 	vld1.32	{q9},[$key_],#16
    369 	aesmc	$dat,$dat
    370 	b.gt	.Loop_cbc_enc
    371 
    372 	aese	$dat,q8
    373 	aesmc	$dat,$dat
    374 	 subs	$len,$len,#16
    375 	aese	$dat,q9
    376 	aesmc	$dat,$dat
    377 	 cclr	$step,eq
    378 	aese	$dat,q10
    379 	aesmc	$dat,$dat
    380 	 add	$key_,$key,#16
    381 	aese	$dat,q11
    382 	aesmc	$dat,$dat
    383 	 vld1.8	{q8},[$inp],$step
    384 	aese	$dat,q12
    385 	aesmc	$dat,$dat
    386 	 veor	q8,q8,$rndzero_n_last
    387 	aese	$dat,q13
    388 	aesmc	$dat,$dat
    389 	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
    390 	aese	$dat,q14
    391 	aesmc	$dat,$dat
    392 	aese	$dat,q15
    393 
    394 	 mov	$cnt,$rounds
    395 	veor	$ivec,$dat,$rndlast
    396 	vst1.8	{$ivec},[$out],#16
    397 	b.hs	.Loop_cbc_enc
    398 
    399 	b	.Lcbc_done
    400 
    401 .align	5
    402 .Lcbc_enc128:
    403 	vld1.32	{$in0-$in1},[$key_]
    404 	aese	$dat,q8
    405 	aesmc	$dat,$dat
    406 	b	.Lenter_cbc_enc128
    407 .Loop_cbc_enc128:
    408 	aese	$dat,q8
    409 	aesmc	$dat,$dat
    410 	 vst1.8	{$ivec},[$out],#16
    411 .Lenter_cbc_enc128:
    412 	aese	$dat,q9
    413 	aesmc	$dat,$dat
    414 	 subs	$len,$len,#16
    415 	aese	$dat,$in0
    416 	aesmc	$dat,$dat
    417 	 cclr	$step,eq
    418 	aese	$dat,$in1
    419 	aesmc	$dat,$dat
    420 	aese	$dat,q10
    421 	aesmc	$dat,$dat
    422 	aese	$dat,q11
    423 	aesmc	$dat,$dat
    424 	 vld1.8	{q8},[$inp],$step
    425 	aese	$dat,q12
    426 	aesmc	$dat,$dat
    427 	aese	$dat,q13
    428 	aesmc	$dat,$dat
    429 	aese	$dat,q14
    430 	aesmc	$dat,$dat
    431 	 veor	q8,q8,$rndzero_n_last
    432 	aese	$dat,q15
    433 	veor	$ivec,$dat,$rndlast
    434 	b.hs	.Loop_cbc_enc128
    435 
    436 	vst1.8	{$ivec},[$out],#16
    437 	b	.Lcbc_done
    438 
    439 .align	5
    440 .Lcbc_dec128:
    441 	vld1.32	{$tmp0-$tmp1},[$key_]
    442 	veor	$ivec,$ivec,$rndlast
    443 	veor	$in0,$dat0,$rndlast
    444 	mov	$step1,$step
    445 
    446 .Loop2x_cbc_dec128:
    447 	aesd	$dat0,q8
    448 	aesd	$dat1,q8
    449 	aesimc	$dat0,$dat0
    450 	aesimc	$dat1,$dat1
    451 	 subs	$len,$len,#32
    452 	aesd	$dat0,q9
    453 	aesd	$dat1,q9
    454 	aesimc	$dat0,$dat0
    455 	aesimc	$dat1,$dat1
    456 	 cclr	$step,lo
    457 	aesd	$dat0,$tmp0
    458 	aesd	$dat1,$tmp0
    459 	aesimc	$dat0,$dat0
    460 	aesimc	$dat1,$dat1
    461 	 cclr	$step1,ls
    462 	aesd	$dat0,$tmp1
    463 	aesd	$dat1,$tmp1
    464 	aesimc	$dat0,$dat0
    465 	aesimc	$dat1,$dat1
    466 	aesd	$dat0,q10
    467 	aesd	$dat1,q10
    468 	aesimc	$dat0,$dat0
    469 	aesimc	$dat1,$dat1
    470 	aesd	$dat0,q11
    471 	aesd	$dat1,q11
    472 	aesimc	$dat0,$dat0
    473 	aesimc	$dat1,$dat1
    474 	aesd	$dat0,q12
    475 	aesd	$dat1,q12
    476 	aesimc	$dat0,$dat0
    477 	aesimc	$dat1,$dat1
    478 	aesd	$dat0,q13
    479 	aesd	$dat1,q13
    480 	aesimc	$dat0,$dat0
    481 	aesimc	$dat1,$dat1
    482 	aesd	$dat0,q14
    483 	aesd	$dat1,q14
    484 	aesimc	$dat0,$dat0
    485 	aesimc	$dat1,$dat1
    486 	aesd	$dat0,q15
    487 	aesd	$dat1,q15
    488 
    489 	veor	$ivec,$ivec,$dat0
    490 	vld1.8	{$dat0},[$inp],$step
    491 	veor	$in0,$in0,$dat1
    492 	vld1.8	{$dat1},[$inp],$step1
    493 	vst1.8	{$ivec},[$out],#16
    494 	veor	$ivec,$in1,$rndlast
    495 	vst1.8	{$in0},[$out],#16
    496 	veor	$in0,$dat0,$rndlast
    497 	vorr	$in1,$dat1,$dat1
    498 	b.hs	.Loop2x_cbc_dec128
    499 
    500 	adds	$len,$len,#32
    501 	veor	$ivec,$ivec,$rndlast
    502 	b.eq	.Lcbc_done
    503 	veor	$in0,$in0,$rndlast
    504 	b	.Lcbc_dec_tail
    505 
    506 .align	5
    507 .Lcbc_dec:
    508 	subs	$len,$len,#16
    509 	vorr	$in0,$dat,$dat
    510 	b.lo	.Lcbc_dec_tail
    511 
    512 	cclr	$step,eq
    513 	cmp	$rounds,#2
    514 	vld1.8	{$dat1},[$inp],$step
    515 	vorr	$in1,$dat1,$dat1
    516 	b.eq	.Lcbc_dec128
    517 
    518 .Loop2x_cbc_dec:
    519 	aesd	$dat0,q8
    520 	aesd	$dat1,q8
    521 	vld1.32	{q8},[$key_],#16
    522 	aesimc	$dat0,$dat0
    523 	aesimc	$dat1,$dat1
    524 	subs	$cnt,$cnt,#2
    525 	aesd	$dat0,q9
    526 	aesd	$dat1,q9
    527 	vld1.32	{q9},[$key_],#16
    528 	aesimc	$dat0,$dat0
    529 	aesimc	$dat1,$dat1
    530 	b.gt	.Loop2x_cbc_dec
    531 
    532 	aesd	$dat0,q8
    533 	aesd	$dat1,q8
    534 	aesimc	$dat0,$dat0
    535 	aesimc	$dat1,$dat1
    536 	 veor	$tmp0,$ivec,$rndlast
    537 	 veor	$tmp1,$in0,$rndlast
    538 	aesd	$dat0,q9
    539 	aesd	$dat1,q9
    540 	aesimc	$dat0,$dat0
    541 	aesimc	$dat1,$dat1
    542 	 vorr	$ivec,$in1,$in1
    543 	 subs	$len,$len,#32
    544 	aesd	$dat0,q10
    545 	aesd	$dat1,q10
    546 	aesimc	$dat0,$dat0
    547 	 cclr	$step,lo
    548 	aesimc	$dat1,$dat1
    549 	 mov	$key_,$key
    550 	aesd	$dat0,q11
    551 	aesd	$dat1,q11
    552 	aesimc	$dat0,$dat0
    553 	 vld1.8	{$in0},[$inp],$step
    554 	aesimc	$dat1,$dat1
    555 	 cclr	$step,ls
    556 	aesd	$dat0,q12
    557 	aesd	$dat1,q12
    558 	aesimc	$dat0,$dat0
    559 	aesimc	$dat1,$dat1
    560 	 vld1.8	{$in1},[$inp],$step
    561 	aesd	$dat0,q13
    562 	aesd	$dat1,q13
    563 	aesimc	$dat0,$dat0
    564 	aesimc	$dat1,$dat1
    565 	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
    566 	aesd	$dat0,q14
    567 	aesd	$dat1,q14
    568 	aesimc	$dat0,$dat0
    569 	aesimc	$dat1,$dat1
    570 	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
    571 	aesd	$dat0,q15
    572 	aesd	$dat1,q15
    573 
    574 	 mov	$cnt,$rounds
    575 	veor	$tmp0,$tmp0,$dat0
    576 	veor	$tmp1,$tmp1,$dat1
    577 	 vorr	$dat0,$in0,$in0
    578 	vst1.8	{$tmp0},[$out],#16
    579 	 vorr	$dat1,$in1,$in1
    580 	vst1.8	{$tmp1},[$out],#16
    581 	b.hs	.Loop2x_cbc_dec
    582 
    583 	adds	$len,$len,#32
    584 	b.eq	.Lcbc_done
    585 
    586 .Lcbc_dec_tail:
    587 	aesd	$dat,q8
    588 	vld1.32	{q8},[$key_],#16
    589 	aesimc	$dat,$dat
    590 	subs	$cnt,$cnt,#2
    591 	aesd	$dat,q9
    592 	vld1.32	{q9},[$key_],#16
    593 	aesimc	$dat,$dat
    594 	b.gt	.Lcbc_dec_tail
    595 
    596 	aesd	$dat,q8
    597 	aesimc	$dat,$dat
    598 	aesd	$dat,q9
    599 	aesimc	$dat,$dat
    600 	 veor	$tmp,$ivec,$rndlast
    601 	aesd	$dat,q10
    602 	aesimc	$dat,$dat
    603 	 vorr	$ivec,$in0,$in0
    604 	aesd	$dat,q11
    605 	aesimc	$dat,$dat
    606 	aesd	$dat,q12
    607 	aesimc	$dat,$dat
    608 	aesd	$dat,q13
    609 	aesimc	$dat,$dat
    610 	aesd	$dat,q14
    611 	aesimc	$dat,$dat
    612 	aesd	$dat,q15
    613 
    614 	veor	$tmp,$tmp,$dat
    615 	vst1.8	{$tmp},[$out],#16
    616 
    617 .Lcbc_done:
    618 	vst1.8	{$ivec},[$ivp]
    619 .Lcbc_abort:
    620 ___
    621 $code.=<<___	if ($flavour !~ /64/);
    622 	vldmia	sp!,{d8-d15}
    623 	ldmia	sp!,{r4-r8,pc}
    624 ___
    625 $code.=<<___	if ($flavour =~ /64/);
    626 	ldr	x29,[sp],#16
    627 	ret
    628 ___
    629 $code.=<<___;
    630 .size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
    631 ___
    632 }}}
    633 {{{
    634 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
    635 my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
    636 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
    637 
    638 my ($dat,$tmp)=($dat0,$tmp0);
    639 
    640 ### q8-q15	preloaded key schedule
    641 
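# Only the low 32 bits of the counter block are incremented, as the
# _ctr32_ name implies: the big-endian word at $ivp+12 is kept in $ctr,
# byte-swapped with rev on little-endian hosts, bumped once per block and
# re-inserted into lane 3 of the working vector with vmov.32; $len counts
# 16-byte blocks, not bytes.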
    642 $code.=<<___;
    643 .globl	${prefix}_ctr32_encrypt_blocks
    644 .type	${prefix}_ctr32_encrypt_blocks,%function
    645 .align	5
    646 ${prefix}_ctr32_encrypt_blocks:
    647 ___
    648 $code.=<<___	if ($flavour =~ /64/);
    649 	stp		x29,x30,[sp,#-16]!
    650 	add		x29,sp,#0
    651 ___
    652 $code.=<<___	if ($flavour !~ /64/);
    653 	mov		ip,sp
    654 	stmdb		sp!,{r4-r10,lr}
    655 	vstmdb		sp!,{d8-d15}            @ ABI specification says so
    656 	ldr		r4, [ip]		@ load remaining arg
    657 ___
    658 $code.=<<___;
    659 	ldr		$rounds,[$key,#240]
    660 
    661 	ldr		$ctr, [$ivp, #12]
    662 	vld1.32		{$dat0},[$ivp]
    663 
    664 	vld1.32		{q8-q9},[$key]		// load key schedule...
    665 	sub		$rounds,$rounds,#6
    666 	add		$key_,$key,x5,lsl#4	// pointer to last 7 round keys
    667 	sub		$rounds,$rounds,#2
    668 	vld1.32		{q10-q11},[$key_],#32
    669 	vld1.32		{q12-q13},[$key_],#32
    670 	vld1.32		{q14-q15},[$key_],#32
    671 	vld1.32		{$rndlast},[$key_]
    672 
    673 	add		$key_,$key,#32
    674 	mov		$cnt,$rounds
    675 
    676 	subs		$len,$len,#2
    677 	b.lo		.Lctr32_tail
    678 
    679 #ifndef __ARMEB__
    680 	rev		$ctr, $ctr
    681 #endif
    682 	vorr		$dat1,$dat0,$dat0
    683 	add		$ctr, $ctr, #1
    684 	vorr		$ivec,$dat0,$dat0
    685 	rev		$tctr1, $ctr
    686 	cmp		$rounds,#2
    687 	vmov.32		${dat1}[3],$tctr1
    688 	b.eq		.Lctr32_128
    689 
    690 .Loop2x_ctr32:
    691 	aese		$dat0,q8
    692 	aese		$dat1,q8
    693 	vld1.32		{q8},[$key_],#16
    694 	aesmc		$dat0,$dat0
    695 	aesmc		$dat1,$dat1
    696 	subs		$cnt,$cnt,#2
    697 	aese		$dat0,q9
    698 	aese		$dat1,q9
    699 	vld1.32		{q9},[$key_],#16
    700 	aesmc		$dat0,$dat0
    701 	aesmc		$dat1,$dat1
    702 	b.gt		.Loop2x_ctr32
    703 
    704 	aese		$dat0,q8
    705 	aese		$dat1,q8
    706 	aesmc		$tmp0,$dat0
    707 	 vorr		$dat0,$ivec,$ivec
    708 	aesmc		$tmp1,$dat1
    709 	 vorr		$dat1,$ivec,$ivec
    710 	aese		$tmp0,q9
    711 	aese		$tmp1,q9
    712 	 vld1.8		{$in0},[$inp],#16
    713 	aesmc		$tmp0,$tmp0
    714 	 vld1.8		{$in1},[$inp],#16
    715 	aesmc		$tmp1,$tmp1
    716 	 add		$ctr,$ctr,#1
    717 	aese		$tmp0,q10
    718 	aese		$tmp1,q10
    719 	 rev		$tctr,$ctr
    720 	aesmc		$tmp0,$tmp0
    721 	aesmc		$tmp1,$tmp1
    722 	 add		$ctr,$ctr,#1
    723 	aese		$tmp0,q11
    724 	aese		$tmp1,q11
    725 	 veor		$in0,$in0,$rndlast
    726 	 rev		$tctr1,$ctr
    727 	aesmc		$tmp0,$tmp0
    728 	aesmc		$tmp1,$tmp1
    729 	 veor		$in1,$in1,$rndlast
    730 	 mov		$key_,$key
    731 	aese		$tmp0,q12
    732 	aese		$tmp1,q12
    733 	 subs		$len,$len,#2
    734 	aesmc		$tmp0,$tmp0
    735 	aesmc		$tmp1,$tmp1
    736 	 vld1.32	 {q8-q9},[$key_],#32	// re-pre-load rndkey[0-1]
    737 	aese		$tmp0,q13
    738 	aese		$tmp1,q13
    739 	aesmc		$tmp0,$tmp0
    740 	aesmc		$tmp1,$tmp1
    741 	aese		$tmp0,q14
    742 	aese		$tmp1,q14
    743 	 vmov.32	${dat0}[3], $tctr
    744 	aesmc		$tmp0,$tmp0
    745 	 vmov.32	${dat1}[3], $tctr1
    746 	aesmc		$tmp1,$tmp1
    747 	aese		$tmp0,q15
    748 	aese		$tmp1,q15
    749 
    750 	 mov		$cnt,$rounds
    751 	veor		$in0,$in0,$tmp0
    752 	veor		$in1,$in1,$tmp1
    753 	vst1.8		{$in0},[$out],#16
    754 	vst1.8		{$in1},[$out],#16
    755 	b.hs		.Loop2x_ctr32
    756 
    757 	adds		$len,$len,#2
    758 	b.eq		.Lctr32_done
    759 	b		.Lctr32_tail
    760 
    761 .Lctr32_128:
    762 	vld1.32		{$tmp0-$tmp1},[$key_]
    763 
    764 .Loop2x_ctr32_128:
    765 	aese		$dat0,q8
    766 	aese		$dat1,q8
    767 	aesmc		$dat0,$dat0
    768 	 vld1.8		{$in0},[$inp],#16
    769 	aesmc		$dat1,$dat1
    770 	 vld1.8		{$in1},[$inp],#16
    771 	aese		$dat0,q9
    772 	aese		$dat1,q9
    773 	 add		$ctr,$ctr,#1
    774 	aesmc		$dat0,$dat0
    775 	aesmc		$dat1,$dat1
    776 	 rev		$tctr,$ctr
    777 	aese		$dat0,$tmp0
    778 	aese		$dat1,$tmp0
    779 	 add		$ctr,$ctr,#1
    780 	aesmc		$dat0,$dat0
    781 	aesmc		$dat1,$dat1
    782 	 rev		$tctr1,$ctr
    783 	aese		$dat0,$tmp1
    784 	aese		$dat1,$tmp1
    785 	 subs		$len,$len,#2
    786 	aesmc		$dat0,$dat0
    787 	aesmc		$dat1,$dat1
    788 	aese		$dat0,q10
    789 	aese		$dat1,q10
    790 	aesmc		$dat0,$dat0
    791 	aesmc		$dat1,$dat1
    792 	aese		$dat0,q11
    793 	aese		$dat1,q11
    794 	aesmc		$dat0,$dat0
    795 	aesmc		$dat1,$dat1
    796 	aese		$dat0,q12
    797 	aese		$dat1,q12
    798 	aesmc		$dat0,$dat0
    799 	aesmc		$dat1,$dat1
    800 	aese		$dat0,q13
    801 	aese		$dat1,q13
    802 	aesmc		$dat0,$dat0
    803 	aesmc		$dat1,$dat1
    804 	aese		$dat0,q14
    805 	aese		$dat1,q14
    806 	aesmc		$dat0,$dat0
    807 	aesmc		$dat1,$dat1
    808 	 veor		$in0,$in0,$rndlast
    809 	aese		$dat0,q15
    810 	 veor		$in1,$in1,$rndlast
    811 	aese		$dat1,q15
    812 
    813 	veor		$in0,$in0,$dat0
    814 	vorr		$dat0,$ivec,$ivec
    815 	veor		$in1,$in1,$dat1
    816 	vorr		$dat1,$ivec,$ivec
    817 	vst1.8		{$in0},[$out],#16
    818 	vmov.32		${dat0}[3], $tctr
    819 	vst1.8		{$in1},[$out],#16
    820 	vmov.32		${dat1}[3], $tctr1
    821 	b.hs		.Loop2x_ctr32_128
    822 
    823 	adds		$len,$len,#2
    824 	b.eq		.Lctr32_done
    825 
    826 .Lctr32_tail:
    827 	aese		$dat,q8
    828 	vld1.32		{q8},[$key_],#16
    829 	aesmc		$dat,$dat
    830 	subs		$cnt,$cnt,#2
    831 	aese		$dat,q9
    832 	vld1.32		{q9},[$key_],#16
    833 	aesmc		$dat,$dat
    834 	b.gt		.Lctr32_tail
    835 
    836 	aese		$dat,q8
    837 	aesmc		$dat,$dat
    838 	aese		$dat,q9
    839 	aesmc		$dat,$dat
    840 	 vld1.8		{$in0},[$inp]
    841 	aese		$dat,q10
    842 	aesmc		$dat,$dat
    843 	aese		$dat,q11
    844 	aesmc		$dat,$dat
    845 	aese		$dat,q12
    846 	aesmc		$dat,$dat
    847 	aese		$dat,q13
    848 	aesmc		$dat,$dat
    849 	aese		$dat,q14
    850 	aesmc		$dat,$dat
    851 	 veor		$in0,$in0,$rndlast
    852 	aese		$dat,q15
    853 
    854 	veor		$in0,$in0,$dat
    855 	vst1.8		{$in0},[$out]
    856 
    857 .Lctr32_done:
    858 ___
    859 $code.=<<___	if ($flavour !~ /64/);
    860 	vldmia		sp!,{d8-d15}
    861 	ldmia		sp!,{r4-r10,pc}
    862 ___
    863 $code.=<<___	if ($flavour =~ /64/);
    864 	ldr		x29,[sp],#16
    865 	ret
    866 ___
    867 $code.=<<___;
    868 .size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
    869 ___
    870 }}}
    871 $code.=<<___;
    872 #endif
    873 ___
    874 ########################################
    875 if ($flavour =~ /64/) {			######## 64-bit code
    876     my %opcode = (
    877 	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
    878 	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
    879 
    880     local *unaes = sub {
    881 	my ($mnemonic,$arg)=@_;
    882 
    883 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
    884 	sprintf ".inst\t0x%08x\t//%s %s",
    885 			$opcode{$mnemonic}|$1|($2<<5),
    886 			$mnemonic,$arg;
    887     };
    888 
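    # Note: unaes is kept for reference but the substitution that would
    # call it is commented out below; this flavour relies on the
    # ".arch armv8-a+crypto" directive emitted earlier instead.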
    889     foreach(split("\n",$code)) {
    890         s/\`([^\`]*)\`/eval($1)/geo;
    891 
    892 	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
    893         s/@\s/\/\//o;			# old->new style commentary
    894 
    895 	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
    896 	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
    897         s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
    898         s/vext\.8/ext/o		or
    899         s/vrev32\.8/rev32/o	or
    900         s/vtst\.8/cmtst/o	or
    901         s/vshr/ushr/o		or
    902         s/^(\s+)v/$1/o		or	# strip off v prefix
    903 	s/\bbx\s+lr\b/ret/o;
    904 
     905 	# fix up remaining legacy suffixes
    906 	s/\.[ui]?8//o;
    907 	m/\],#8/o and s/\.16b/\.8b/go;
    908         s/\.[ui]?32//o and s/\.16b/\.4s/go;
    909         s/\.[ui]?64//o and s/\.16b/\.2d/go;
    910 	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
    911 
    912         print $_,"\n";
    913     }
    914 } else {				######## 32-bit code
    915     my %opcode = (
    916 	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
    917 	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
    918 
    919     local *unaes = sub {
    920 	my ($mnemonic,$arg)=@_;
    921 
    922 	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
    923 	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
    924 					 |(($2&7)<<1) |(($2&8)<<2);
     925 	    # Emit raw bytes, since ARMv7 instructions are always encoded
     926 	    # little-endian. The correct solution is to use the .inst
     927 	    # directive, but older assemblers don't implement it :-(
    928 	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
    929 			$word&0xff,($word>>8)&0xff,
    930 			($word>>16)&0xff,($word>>24)&0xff,
    931 			$mnemonic,$arg;
    932 	}
    933     };
    934 
    935     sub unvtbl {
    936 	my $arg=shift;
    937 
    938 	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
    939 	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
    940 		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;	
    941     }
    942 
    943     sub unvdup32 {
    944 	my $arg=shift;
    945 
    946 	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
    947 	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	
    948     }
    949 
    950     sub unvmov32 {
    951 	my $arg=shift;
    952 
    953 	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
    954 	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;	
    955     }
    956 
    957     foreach(split("\n",$code)) {
    958         s/\`([^\`]*)\`/eval($1)/geo;
    959 
    960 	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
    961 	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
    962         s/\/\/\s?/@ /o;				# new->old style commentary
    963 
     964 	# fix up remaining new-style suffixes
    965 	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
    966 	s/\],#[0-9]+/]!/o;
    967 
    968 	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
    969 	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
    970 	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
    971 	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
    972 	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
    973 	s/^(\s+)b\./$1b/o				or
    974 	s/^(\s+)ret/$1bx\tlr/o;
    975 
    976         print $_,"\n";
    977     }
    978 }
    979 
    980 close STDOUT;
    981