# Home | History | Annotate | Download | only in asm   (code-viewer navigation residue, kept as a comment)
      1 #! /usr/bin/env perl
      2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 #
     10 # ====================================================================
     11 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
     12 # project. The module is, however, dual licensed under OpenSSL and
     13 # CRYPTOGAMS licenses depending on where you obtain it. For further
     14 # details see http://www.openssl.org/~appro/cryptogams/.
     15 # ====================================================================
     16 #
     17 # This module implements support for ARMv8 AES instructions. The
     18 # module is endian-agnostic in sense that it supports both big- and
     19 # little-endian cases. As does it support both 32- and 64-bit modes
     20 # of operation. Latter is achieved by limiting amount of utilized
     21 # registers to 16, which implies additional NEON load and integer
     22 # instructions. This has no effect on mighty Apple A7, where results
     23 # are literally equal to the theoretical estimates based on AES
     24 # instruction latencies and issue rates. On Cortex-A53, an in-order
     25 # execution core, this costs up to 10-15%, which is partially
     26 # compensated by implementing dedicated code path for 128-bit
     27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
     28 # seems to be limited by sheer amount of NEON instructions...
     29 #
     30 # Performance in cycles per byte processed with 128-bit key:
     31 #
     32 #		CBC enc		CBC dec		CTR
     33 # Apple A7	2.39		1.20		1.20
     34 # Cortex-A53	1.32		1.29		1.46
     35 # Cortex-A57(*)	1.95		0.85		0.93
     36 # Denver	1.96		0.86		0.80
     37 # Mongoose	1.33		1.20		1.20
     38 #
     39 # (*)	original 3.64/1.34/1.32 results were for r0p0 revision
     40 #	and are still same even for updated module;
     41 
     42 $flavour = shift;
     43 $output  = shift;
     44 
     45 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     46 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     47 ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
     48 die "can't locate arm-xlate.pl";
     49 
     50 open OUT,"| \"$^X\" $xlate $flavour $output";
     51 *STDOUT=*OUT;
     52 
     53 $prefix="aes_hw";
     54 
# Common preamble: guard the whole module on __ARM_MAX_ARCH__>=7 so it
# assembles to nothing on targets without the crypto extensions.
$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit only: request the crypto ISA extension (guarded for clang
# versions that lack .arch support).
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch  armv8-a+crypto
#endif
___
# 32-bit only: ARM (not Thumb) mode with NEON enabled.
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Integer operand aliases for the key-schedule routines (64-bit names;
# the 32-bit flavour maps them back to r-registers in post-processing).
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON temporaries.  The 32-bit flavour avoids q4-q7 (d8-d15 are
# callee-saved under AAPCS), hence the alternative q8-q10 mapping.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the AES round constants plus a byte permutation used by
# vtbl to implement RotWord-and-splat during key expansion.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# AArch64 prologue: save frame pointer and link register.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Key expansion proper.  Argument validation returns -1 for NULL
# in/out pointers and -2 for a bit length other than 128/192/256;
# success returns 0 with the round count stored after the schedule.
# Three code paths: .Loop128 (10 rounds), .L192 (12), .L256 (14);
# aese against a zero block is used as a SubBytes primitive.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# set_decrypt_key prologues: 64-bit saves x29/x30; 32-bit saves r4/lr.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encrypt schedule via .Lenc_key, then convert it in place:
# reverse the order of the round keys (swapping from both ends) and
# apply aesimc (InvMixColumns) to every key except the outermost two.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
# Matching epilogues for the two flavours.
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block en/decrypt routine, ${prefix}_encrypt or
# ${prefix}_decrypt.  $dir is "en" or "de" and selects the mnemonic
# pair: aese/aesmc for encryption, aesd/aesimc for decryption.
# (The empty () prototype is irrelevant: the &gen_block(...) call
# syntax below bypasses prototype checking.)
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# Arguments: in-pointer, out-pointer, key schedule; rounds is read
# from offset 240 of the schedule, as written by the set-key routines.
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Main loop consumes two round keys per iteration; the final two
# rounds are peeled: last aese/aesd is not followed by a MixColumns,
# and the last round key is applied with a plain veor.
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt(inp, out, len, key, ivec, enc):
# CBC mode for both directions, selected at runtime by $enc (w5).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Extra pointers into the key schedule used by the generic encrypt path.
# Note $key5/$step1 share x12 and $key7 aliases $key — safe given the
# order they are consumed in below.
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# 32-bit prologue: the 5th/6th args come off the caller's stack, and
# d8-d15 must be preserved per AAPCS.
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Shared setup: len is truncated to whole blocks; $step becomes 0 when
# exactly one block remains (cclr) so the last load doesn't overrun.
# Rounds 2..7 stay resident in q10-q15 plus the last key in $rndlast;
# encryption then branches per key size (128 has a dedicated path).
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# CBC decryption: independent blocks, so .Loop3x_cbc_dec pipelines
# three blocks per iteration; .Lcbc_dec_tail mops up the last 1-2.
# Indented (leading-space) instructions interleave loads/XORs with the
# AES pipeline.
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore callee-saved state per flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks(inp, out, blocks, key, ivec):
# CTR mode with a 32-bit big-endian counter held in lane [3] of the
# counter block; $len counts 16-byte blocks, not bytes.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
# 32-bit prologue: preserve d8-d15 per AAPCS and fetch the 5th arg
# (ivec pointer) from the caller's stack.
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# Setup keeps the counter in $ctr host-endian (rev on little-endian),
# pre-materializes three counter blocks ($dat0-$dat2), then runs the
# 3-blocks-per-iteration .Loop3x_ctr32; <=2 blocks go to .Lctr32_tail,
# where $step is zeroed so a single trailing block never over-reads.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
# Epilogues per flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the __ARM_MAX_ARCH__ guard opened in the preamble.
$code.=<<___;
#endif
___
########################################
# Post-processing: transliterate the mixed-mnemonic $code into pure
# 64-bit or pure 32-bit assembly, line by line.  The substitution
# order within each branch is load-bearing — do not reorder.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word (Rd | Rn<<5) for
    # assemblers without crypto support.  Currently unused here: the
    # substitution that would call it is commented out below.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	# Expand `...` snippets embedded in the heredocs (e.g. the
	# flavour-conditional epilogue in set_encrypt_key).
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 map to v0-v7, q8+ shift up to v16+ (see register notes
	# in the key-schedule section).
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode an AES instruction as raw .byte data: the q-register
    # numbers are split across the D/Vd and M/Vm opcode fields.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # vtbl on AArch32 works on 64-bit d-registers, so one q-register
    # table lookup becomes two vtbl.8 instructions.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map a q-register lane index to the containing d-register + lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Same d-register/lane mapping for scalar moves into a q lane.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}
   1011 
   1012 close STDOUT;
   1013