Home | History | Annotate | Download | only in asm
      1 #include "arm_arch.h"
      2 
      3 #if __ARM_ARCH__>=7
      4 .text
      5 .fpu	neon
      6 .code	32
      7 .align	5
        @ Constant table used by the key-schedule code below.  It is addressed
        @ PC-relative (adr r3,rcon) from aes_v8_set_encrypt_key, which loads the
        @ first two rows into q1/q2 up front and the third row into q1 later on.
      8 rcon:
      9 .long	0x01,0x01,0x01,0x01	@ starting round constant; doubled with vshl.u8 after each use
     10 .long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
     11 .long	0x1b,0x1b,0x1b,0x1b	@ round constant reloaded after the 0x01 row has been used eight times
     12 
     13 .globl	aes_v8_set_encrypt_key
     14 .type	aes_v8_set_encrypt_key,%function
     15 .align	5
        @ aes_v8_set_encrypt_key: expand a user key into an AES encryption key schedule.
        @
        @ In:    r0 = user key bytes (16, 24 or 32 bytes are read, depending on path)
        @        r1 = key size in bits (only compared against 192, see note)
        @        r2 = output buffer for the round keys
        @ Out:   r0 = 0
        @        round keys stored from original r2 upwards; the round count
        @        (10, 12 or 14) is stored as a word at original r2 + 240
        @ Exit state relied on by aes_v8_set_decrypt_key (which enters at .Lenc_key):
        @        r2 = original r2 + 240, r12 = round count
        @ Clobbers: r1, r3, r12, q0-q3, q8-q10, flags.
        @
        @ NOTE(review): arguments are not validated here - any r1 below 192 takes
        @ the 128-bit path, any r1 above 192 takes the 256-bit path, and r0/r2 are
        @ dereferenced without a NULL check.  Confirm callers guarantee valid input.
     16 aes_v8_set_encrypt_key:
     17 .Lenc_key:
     18 	adr	r3,rcon	@ r3 -> constant table above
     19 	cmp	r1,#192	@ flags are consumed by blt/beq below; nothing in between sets flags
     20 
     21 	veor	q0,q0,q0	@ q0 = 0 (zero filler for vext and zero "key" for aese)
     22 	vld1.8	{q3},[r0]!	@ q3 = first 16 key bytes
     23 	mov	r1,#8		@ reuse r1
     24 	vld1.32	{q1,q2},[r3]!	@ q1 = round constant row, q2 = vtbl index mask; r3 -> 0x1b row
     25 
     26 	blt	.Loop128
     27 	beq	.L192
     28 	b	.L256
     29 
        @ ---- 128-bit path: 8 loop iterations plus two unrolled steps, 11 round keys ----
     30 .align	4
     31 .Loop128:
     32 	vtbl.8	d20,{q3},d4	@ q10 = bytes 13,14,15,12 of q3 in every word (last word rotated, splat)
     33 	vtbl.8	d21,{q3},d5
     34 	vext.8	q9,q0,q3,#12	@ q9 = q3 moved up by one word, low word zero-filled from q0
     35 	vst1.32	{q3},[r2]!	@ emit current round key
     36 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
     37 	subs	r1,r1,#1	@ loop counter; flags survive until the bne (NEON ops do not touch them)
     38 
        @ The three veor/vext pairs xor q3 with itself shifted by 1, 2 and 3 words.
     39 	veor	q3,q3,q9
     40 	vext.8	q9,q0,q9,#12
     41 	veor	q3,q3,q9
     42 	vext.8	q9,q0,q9,#12
     43 	 veor	q10,q10,q1	@ fold the round constant into the substituted word
     44 	veor	q3,q3,q9
     45 	vshl.u8	q1,q1,#1	@ next round constant = current * 2
     46 	veor	q3,q3,q10	@ q3 = next round key
     47 	bne	.Loop128
     48 
     49 	vld1.32	{q1},[r3]	@ q1 = 0x1b row (r3 was advanced past the first two rows)
     50 
        @ Same step as the loop body, using round constant 0x1b.
     51 	vtbl.8	d20,{q3},d4
     52 	vtbl.8	d21,{q3},d5
     53 	vext.8	q9,q0,q3,#12
     54 	vst1.32	{q3},[r2]!
     55 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
     56 
     57 	veor	q3,q3,q9
     58 	vext.8	q9,q0,q9,#12
     59 	veor	q3,q3,q9
     60 	vext.8	q9,q0,q9,#12
     61 	 veor	q10,q10,q1
     62 	veor	q3,q3,q9
     63 	vshl.u8	q1,q1,#1	@ 0x1b -> 0x36 for the final step
     64 	veor	q3,q3,q10
     65 
        @ Final step; the last round key is stored without post-increment.
     66 	vtbl.8	d20,{q3},d4
     67 	vtbl.8	d21,{q3},d5
     68 	vext.8	q9,q0,q3,#12
     69 	vst1.32	{q3},[r2]!
     70 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
     71 
     72 	veor	q3,q3,q9
     73 	vext.8	q9,q0,q9,#12
     74 	veor	q3,q3,q9
     75 	vext.8	q9,q0,q9,#12
     76 	 veor	q10,q10,q1
     77 	veor	q3,q3,q9
     78 	veor	q3,q3,q10
     79 	vst1.32	{q3},[r2]
     80 	add	r2,r2,#0x50	@ 10 post-incremented stores = 0xa0; +0x50 puts r2 at original + 240
     81 
     82 	mov	r12,#10	@ round count for a 128-bit key
     83 	b	.Ldone
     84 
        @ ---- 192-bit path: 8 iterations, each emitting 8 + 16 bytes of schedule ----
     85 .align	4
     86 .L192:
     87 	vld1.8	{d16},[r0]!	@ d16 = key bytes 16..23
     88 	vmov.i8	q10,#8			@ borrow q10
     89 	vst1.32	{q3},[r2]!	@ first 16 bytes of the schedule are the key itself
     90 	vsub.i8	q2,q2,q10	@ adjust the mask
        @ Indices in q2 are now 5,6,7,4: the upper word of d16, rotated and splat.
     91 
     92 .Loop192:
     93 	vtbl.8	d20,{q8},d4
     94 	vtbl.8	d21,{q8},d5
     95 	vext.8	q9,q0,q3,#12
     96 	vst1.32	{d16},[r2]!	@ emit the 8-byte part held in d16
     97 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
     98 	subs	r1,r1,#1	@ flags survive until the bne at the bottom
     99 
    100 	veor	q3,q3,q9
    101 	vext.8	q9,q0,q9,#12
    102 	veor	q3,q3,q9
    103 	vext.8	q9,q0,q9,#12
    104 	veor	q3,q3,q9
    105 
    106 	vdup.32	q9,d7[1]	@ splat the top word of q3
    107 	veor	q9,q9,q8
    108 	 veor	q10,q10,q1	@ fold in the round constant
    109 	vext.8	q8,q0,q8,#12
    110 	vshl.u8	q1,q1,#1	@ next round constant
    111 	veor	q8,q8,q9
    112 	veor	q3,q3,q10
    113 	veor	q8,q8,q10
    114 	vst1.32	{q3},[r2]!	@ emit the 16-byte part
    115 	bne	.Loop192
    116 
    117 	mov	r12,#12	@ round count for a 192-bit key
    118 	add	r2,r2,#0x20	@ 16 + 8*24 = 208 bytes written; +0x20 puts r2 at original + 240
    119 	b	.Ldone
    120 
        @ ---- 256-bit path: 7 iterations, leaving through the beq in mid-loop ----
    121 .align	4
    122 .L256:
    123 	vld1.8	{q8},[r0]	@ q8 = key bytes 16..31
    124 	mov	r1,#7
    125 	mov	r12,#14	@ round count for a 256-bit key
    126 	vst1.32	{q3},[r2]!
    127 
    128 .Loop256:
    129 	vtbl.8	d20,{q8},d4	@ q10 = last word of q8, rotated and splat
    130 	vtbl.8	d21,{q8},d5
    131 	vext.8	q9,q0,q3,#12
    132 	vst1.32	{q8},[r2]!
    133 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
    134 	subs	r1,r1,#1	@ flags are tested by the beq below
    135 
    136 	veor	q3,q3,q9
    137 	vext.8	q9,q0,q9,#12
    138 	veor	q3,q3,q9
    139 	vext.8	q9,q0,q9,#12
    140 	 veor	q10,q10,q1
    141 	veor	q3,q3,q9
    142 	vshl.u8	q1,q1,#1
    143 	veor	q3,q3,q10
    144 	vst1.32	{q3},[r2]!
    145 	beq	.Ldone	@ after the 7th pass r2 = original + 16 + 7*32 = original + 240
    146 
        @ Second half of the step: no rotation and no round constant, just the
        @ splatted top word of q3 run through aese with a zero key.
    147 	vdup.32	q10,d7[1]
    148 	vext.8	q9,q0,q8,#12
    149 	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
    150 
    151 	veor	q8,q8,q9
    152 	vext.8	q9,q0,q9,#12
    153 	veor	q8,q8,q9
    154 	vext.8	q9,q0,q9,#12
    155 	veor	q8,q8,q9
    156 
    157 	veor	q8,q8,q10
    158 	b	.Loop256
    159 
    160 .Ldone:
    161 	str	r12,[r2]	@ round count stored at original r2 + 240
    162 
    163 	eor	r0,r0,r0		@ return value
    164 
    165 	bx	lr
    166 .size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
    167 
    168 .globl	aes_v8_set_decrypt_key
    169 .type	aes_v8_set_decrypt_key,%function
    170 .align	5
        @ aes_v8_set_decrypt_key: build a decryption key schedule.
        @
        @ Takes the same r0/r1/r2 arguments as aes_v8_set_encrypt_key.  It first
        @ generates the encryption schedule through the shared .Lenc_key entry,
        @ then reverses the order of the round keys in place.  The first and last
        @ keys are swapped unchanged; every key in between is passed through
        @ aesimc as it is moved.
        @ Out:   r0 = 0.  r4 and lr are saved and restored on the stack.
        @ NOTE(review): the value left in r0 by .Lenc_key is not inspected here.
    171 aes_v8_set_decrypt_key:
    172 	stmdb	sp!,{r4,lr}
    173 	bl	.Lenc_key	@ leaves r2 = schedule + 240 and r12 = round count
    174 
    175 	sub	r2,r2,#240		@ restore original r2
    176 	mov	r4,#-16	@ post-index stride for the pointer walking down from the end
    177 	add	r0,r2,r12,lsl#4	@ end of key schedule
    178 
        @ Swap the first and last round keys without transforming them.
    179 	vld1.32	{q0},[r2]
    180 	vld1.32	{q1},[r0]
    181 	vst1.32	{q0},[r0],r4
    182 	vst1.32	{q1},[r2]!
    183 
        @ Swap the remaining pairs from both ends inwards, applying aesimc to each.
    184 .Loop_imc:
    185 	vld1.32	{q0},[r2]
    186 	vld1.32	{q1},[r0]
    187 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    188 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    189 	vst1.32	{q0},[r0],r4
    190 	vst1.32	{q1},[r2]!
    191 	cmp	r0,r2
    192 	bhi	.Loop_imc	@ continue while the upper pointer is still above the lower one
    193 
        @ Middle round key: transformed and written back.  r0 is expected to
        @ equal r2 here, since the round counts set by .Lenc_key are even.
    194 	vld1.32	{q0},[r2]
    195 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    196 	vst1.32	{q0},[r0]
    197 
    198 	eor	r0,r0,r0		@ return value
    199 	ldmia	sp!,{r4,pc}
    200 .size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
    201 .globl	aes_v8_encrypt
    202 .type	aes_v8_encrypt,%function
    203 .align	5
        @ aes_v8_encrypt: encrypt one 16-byte block.
        @
        @ In:    r0 = input block, r1 = output block, r2 = key schedule
        @        (round count is read from the word at r2 + 240)
        @ Out:   16 bytes written to [r1]; no return value is set.
        @ Clobbers: r2, r3, q0-q2, flags.
        @ The input is fully loaded before the output is stored.
    204 aes_v8_encrypt:
    205 	ldr	r3,[r2,#240]	@ r3 = number of rounds
    206 	vld1.32	{q0},[r2]!	@ q0 = round key 0
    207 	vld1.8	{q2},[r0]	@ q2 = input block (the running state)
    208 	sub	r3,r3,#2	@ the last two rounds are handled after the loop
    209 	vld1.32	{q1},[r2]!	@ q1 = round key 1
    210 
        @ Two rounds per iteration; the next round key is fetched right after
        @ each aese so the load overlaps the following aesmc.
    211 .Loop_enc:
    212 	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
    213 	vld1.32	{q0},[r2]!
    214 	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
    215 	subs	r3,r3,#2
    216 	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
    217 	vld1.32	{q1},[r2]!
    218 	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
    219 	bgt	.Loop_enc
    220 
        @ Last two rounds: the final aese has no aesmc and is followed by an
        @ xor with the last round key.
    221 	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
    222 	vld1.32	{q0},[r2]	@ q0 = last round key
    223 	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
    224 	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
    225 	veor	q2,q2,q0
    226 
    227 	vst1.8	{q2},[r1]
    228 	bx	lr
    229 .size	aes_v8_encrypt,.-aes_v8_encrypt
    230 .globl	aes_v8_decrypt
    231 .type	aes_v8_decrypt,%function
    232 .align	5
        @ aes_v8_decrypt: decrypt one 16-byte block.
        @
        @ In:    r0 = input block, r1 = output block, r2 = key schedule
        @        (round count is read from the word at r2 + 240; the schedule is
        @        presumably one prepared by aes_v8_set_decrypt_key - confirm at callers)
        @ Out:   16 bytes written to [r1]; no return value is set.
        @ Clobbers: r2, r3, q0-q2, flags.
        @ Same structure as aes_v8_encrypt, with aesd/aesimc in place of aese/aesmc.
    233 aes_v8_decrypt:
    234 	ldr	r3,[r2,#240]	@ r3 = number of rounds
    235 	vld1.32	{q0},[r2]!	@ q0 = first round key of the schedule
    236 	vld1.8	{q2},[r0]	@ q2 = input block (the running state)
    237 	sub	r3,r3,#2	@ the last two rounds are handled after the loop
    238 	vld1.32	{q1},[r2]!	@ q1 = second round key
    239 
        @ Two rounds per iteration, with round-key loads interleaved.
    240 .Loop_dec:
    241 	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
    242 	vld1.32	{q0},[r2]!
    243 	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
    244 	subs	r3,r3,#2
    245 	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
    246 	vld1.32	{q1},[r2]!
    247 	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
    248 	bgt	.Loop_dec
    249 
        @ Last two rounds: the final aesd has no aesimc and is followed by an
        @ xor with the last round key of the schedule.
    250 	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
    251 	vld1.32	{q0},[r2]	@ q0 = last round key
    252 	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
    253 	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
    254 	veor	q2,q2,q0
    255 
    256 	vst1.8	{q2},[r1]
    257 	bx	lr
    258 .size	aes_v8_decrypt,.-aes_v8_decrypt
    259 .globl	aes_v8_cbc_encrypt
    260 .type	aes_v8_cbc_encrypt,%function
    261 .align	5
        @ aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole blocks.
        @
        @ In:    r0 = input, r1 = output, r2 = length in bytes, r3 = key schedule
        @        [sp]   = pointer to the 16-byte IV (loaded into r4)
        @        [sp+4] = direction flag (loaded into r5): zero selects the
        @                 decrypt path, non-zero the encrypt path
        @ Out:   output written to [r1]; the chaining value for a following call
        @        is written back to the IV buffer.  No return value is set.
        @
        @ A length below 16 returns at once without touching output or IV.
        @ Otherwise the length is rounded down to a multiple of 16 by the
        @ "and r2,r2,#-16" below.
        @
        @ Register roles after setup:
        @   r8        input post-index stride, 16 or 0; it is cleared when the
        @             remaining length runs out so look-ahead loads do not
        @             advance past the last block
        @   q8,q9     round keys 0 and 1 (reloaded by the generic loops)
        @   q10-q15   the six round keys preceding the last, q7 = last round key
        @   r5        round count - 8 (2 for the 10-round case, which has
        @             dedicated loops that keep every round key in registers)
        @   r6        working copy of r5, r7 = pointer used to stream round keys
        @   q6        current chaining value
        @ r4-r8, lr and d8-d15 are saved on entry and restored on exit.
    262 aes_v8_cbc_encrypt:
    263 	mov	ip,sp	@ remember where the stack arguments live
    264 	stmdb	sp!,{r4-r8,lr}
    265 	vstmdb	sp!,{d8-d15}            @ ABI specification says so
    266 	ldmia	ip,{r4-r5}		@ load remaining args
    267 	subs	r2,r2,#16
    268 	mov	r8,#16
    269 	blo	.Lcbc_abort	@ fewer than 16 bytes: nothing to do
    270 	moveq	r8,#0	@ exactly one block: do not advance the input pointer
    271 
    272 	cmp	r5,#0			@ en- or decrypting?
    273 	ldr	r5,[r3,#240]	@ r5 = round count (flags from the cmp are kept until the beq below)
    274 	and	r2,r2,#-16
    275 	vld1.8	{q6},[r4]	@ q6 = IV
    276 	vld1.8	{q0},[r0],r8	@ q0 = first input block
    277 
    278 	vld1.32	{q8-q9},[r3]		@ load key schedule...
    279 	sub	r5,r5,#6
    280 	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
    281 	sub	r5,r5,#2
    282 	vld1.32	{q10-q11},[r7]!
    283 	vld1.32	{q12-q13},[r7]!
    284 	vld1.32	{q14-q15},[r7]!
    285 	vld1.32	{q7},[r7]
    286 
    287 	add	r7,r3,#32	@ r7 -> round key 2
    288 	mov	r6,r5
    289 	beq	.Lcbc_dec	@ direction flag was zero
    290 
        @ ---- Encrypt ----
        @ q5 = round key 0 ^ last round key.  The state in q0 is kept without the
        @ final key xor (the ciphertext goes to q6 separately), and each new input
        @ block is xored with q5 and then used as the operand of the first aese.
        @ That single xor covers the previous block's final key xor, the CBC
        @ chaining xor and round key 0.
    291 	cmp	r5,#2
    292 	veor	q0,q0,q6	@ first block: input ^ IV
    293 	veor	q5,q8,q7
    294 	beq	.Lcbc_enc128
    295 
    296 .Loop_cbc_enc:
    297 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    298 	vld1.32	{q8},[r7]!
    299 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    300 	subs	r6,r6,#2
    301 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    302 	vld1.32	{q9},[r7]!
    303 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    304 	bgt	.Loop_cbc_enc
    305 
        @ Remaining rounds with the preloaded keys.  The flags set by the
        @ "subs r2" below are used by moveq and by the bhs that closes the loop.
    306 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    307 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    308 	 subs	r2,r2,#16
    309 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    310 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    311 	 moveq	r8,#0
    312 	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
    313 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    314 	 add	r7,r3,#16
    315 	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
    316 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    317 	 vld1.8	{q8},[r0],r8
    318 	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
    319 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    320 	 veor	q8,q8,q5
    321 	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
    322 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    323 	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
    324 	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
    325 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    326 	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
    327 
    328 	 mov	r6,r5
    329 	veor	q6,q0,q7	@ q6 = ciphertext block = next chaining value
    330 	vst1.8	{q6},[r1]!
    331 	bhs	.Loop_cbc_enc
    332 
    333 	b	.Lcbc_done
    334 
        @ ---- Encrypt, r5 == 2: round keys 2 and 3 live in q2/q3, so nothing is
        @ reloaded inside the loop.  The store of each ciphertext block is
        @ deferred into the start of the next iteration. ----
    335 .align	5
    336 .Lcbc_enc128:
    337 	vld1.32	{q2-q3},[r7]
    338 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    339 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    340 	b	.Lenter_cbc_enc128
    341 .Loop_cbc_enc128:
    342 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    343 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    344 	 vst1.8	{q6},[r1]!
    345 .Lenter_cbc_enc128:
    346 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    347 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    348 	 subs	r2,r2,#16
    349 	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
    350 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    351 	 moveq	r8,#0
    352 	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
    353 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    354 	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
    355 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    356 	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
    357 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    358 	 vld1.8	{q8},[r0],r8
    359 	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
    360 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    361 	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
    362 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    363 	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
    364 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    365 	 veor	q8,q8,q5
    366 	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
    367 	veor	q6,q0,q7
    368 	bhs	.Loop_cbc_enc128
    369 
    370 	vst1.8	{q6},[r1]!	@ store the last ciphertext block
    371 	b	.Lcbc_done
    372 
        @ ---- Decrypt, r5 == 2: two blocks per iteration with all round keys in
        @ registers (q4/q5 hold round keys 2 and 3).  q6 and q2 carry the
        @ chaining values for the two blocks, pre-xored with the last round key
        @ q7.  r8 and r12 are the post-index strides of the two look-ahead
        @ loads. ----
    373 .align	5
    374 .Lcbc_dec128:
    375 	vld1.32	{q4-q5},[r7]
    376 	veor	q6,q6,q7
    377 	veor	q2,q0,q7
    378 	mov	r12,r8
    379 
    380 .Loop2x_cbc_dec128:
    381 	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
    382 	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
    383 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    384 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    385 	 subs	r2,r2,#32
    386 	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
    387 	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
    388 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    389 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    390 	 movlo	r8,#0
    391 	.byte	0x48,0x03,0xb0,0xf3	@ aesd q0,q4
    392 	.byte	0x48,0x23,0xb0,0xf3	@ aesd q1,q4
    393 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    394 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    395 	 movls	r12,#0
    396 	.byte	0x4a,0x03,0xb0,0xf3	@ aesd q0,q5
    397 	.byte	0x4a,0x23,0xb0,0xf3	@ aesd q1,q5
    398 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    399 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    400 	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
    401 	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
    402 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    403 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    404 	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
    405 	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
    406 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    407 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    408 	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
    409 	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
    410 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    411 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    412 	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
    413 	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
    414 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    415 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    416 	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
    417 	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
    418 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    419 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    420 	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
    421 	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
    422 
        @ Finish both blocks, fetch the next pair and set up their chaining
        @ values.  The bhs still tests the flags from the "subs r2" above.
    423 	veor	q6,q6,q0
    424 	vld1.8	{q0},[r0],r8
    425 	veor	q2,q2,q1
    426 	vld1.8	{q1},[r0],r12
    427 	vst1.8	{q6},[r1]!
    428 	veor	q6,q3,q7
    429 	vst1.8	{q2},[r1]!
    430 	veor	q2,q0,q7
    431 	vorr	q3,q1,q1
    432 	bhs	.Loop2x_cbc_dec128
    433 
    434 	adds	r2,r2,#32
    435 	veor	q6,q6,q7	@ strip the last round key again: q6 = plain chaining value
    436 	beq	.Lcbc_done
    437 	veor	q2,q2,q7	@ likewise for q2, which the tail keeps as the next IV
    438 	b	.Lcbc_dec_tail
    439 
        @ ---- Decrypt ----
    440 .align	5
    441 .Lcbc_dec:
    442 	subs	r2,r2,#16
    443 	vorr	q2,q0,q0	@ keep a copy of the ciphertext block for chaining
    444 	blo	.Lcbc_dec_tail	@ only one block in total
    445 
    446 	moveq	r8,#0
    447 	cmp	r5,#2
    448 	vld1.8	{q1},[r0],r8	@ q1 = second input block
    449 	vorr	q3,q1,q1	@ and its copy for chaining
    450 	beq	.Lcbc_dec128
    451 
        @ Generic two-block loop: the early round keys are streamed through q8/q9.
    452 .Loop2x_cbc_dec:
    453 	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
    454 	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
    455 	vld1.32	{q8},[r7]!
    456 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    457 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    458 	subs	r6,r6,#2
    459 	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
    460 	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
    461 	vld1.32	{q9},[r7]!
    462 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    463 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    464 	bgt	.Loop2x_cbc_dec
    465 
        @ Remaining rounds.  q4/q5 become the chaining values for the two blocks,
        @ pre-xored with the last round key; q6 takes the second ciphertext block
        @ as the next chaining value.  Flags from the "subs r2" below drive
        @ movlo/movls and the closing bhs.
    466 	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
    467 	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
    468 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    469 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    470 	 veor	q4,q6,q7
    471 	 veor	q5,q2,q7
    472 	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
    473 	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
    474 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    475 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    476 	 vorr	q6,q3,q3
    477 	 subs	r2,r2,#32
    478 	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
    479 	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
    480 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    481 	 movlo	r8,#0
    482 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    483 	 mov	r7,r3
    484 	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
    485 	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
    486 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    487 	 vld1.8	{q2},[r0],r8
    488 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    489 	 movls	r8,#0
    490 	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
    491 	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
    492 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    493 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    494 	 vld1.8	{q3},[r0],r8
    495 	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
    496 	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
    497 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    498 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    499 	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0]
    500 	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
    501 	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
    502 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    503 	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
    504 	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
    505 	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
    506 	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
    507 
    508 	 mov	r6,r5
    509 	veor	q4,q4,q0	@ plaintext 0 = state ^ (previous ciphertext ^ last key)
    510 	veor	q5,q5,q1	@ plaintext 1, same construction
    511 	 vorr	q0,q2,q2	@ next pair becomes the working state
    512 	vst1.8	{q4},[r1]!
    513 	 vorr	q1,q3,q3
    514 	vst1.8	{q5},[r1]!
    515 	bhs	.Loop2x_cbc_dec
    516 
    517 	adds	r2,r2,#32
    518 	beq	.Lcbc_done	@ no odd block left over
    519 
        @ ---- Single-block decrypt tail: q0 = block to decrypt, q2 = its
        @ ciphertext copy (becomes the new chaining value), q6 = chaining value
        @ for this block. ----
    520 .Lcbc_dec_tail:
    521 	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
    522 	vld1.32	{q8},[r7]!
    523 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    524 	subs	r6,r6,#2
    525 	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
    526 	vld1.32	{q9},[r7]!
    527 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    528 	bgt	.Lcbc_dec_tail
    529 
    530 	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
    531 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    532 	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
    533 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    534 	 veor	q4,q6,q7
    535 	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
    536 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    537 	 vorr	q6,q2,q2
    538 	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
    539 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    540 	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
    541 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    542 	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
    543 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    544 	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
    545 	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
    546 	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
    547 
    548 	veor	q4,q4,q0
    549 	vst1.8	{q4},[r1]!
    550 
    551 .Lcbc_done:
    552 	vst1.8	{q6},[r4]	@ write the chaining value back to the IV buffer
    553 .Lcbc_abort:
    554 	vldmia	sp!,{d8-d15}
    555 	ldmia	sp!,{r4-r8,pc}
    556 .size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
    557 .globl	aes_v8_ctr32_encrypt_blocks
    558 .type	aes_v8_ctr32_encrypt_blocks,%function
    559 .align	5
        @ aes_v8_ctr32_encrypt_blocks: CTR-mode processing with a 32-bit counter.
        @
        @ In:    r0 = input, r1 = output, r2 = number of 16-byte blocks,
        @        r3 = key schedule (round count read from r3 + 240),
        @        [sp] = pointer to the 16-byte counter block (loaded into r4)
        @ Out:   output written to [r1].  No return value is set, and nothing is
        @        stored back through r4.
        @
        @ Only the last 32-bit word of the counter block is incremented (held in
        @ r8 and re-inserted into the top lane of q0/q1); the other 96 bits are
        @ copied unchanged from q6.
        @
        @ Register roles after setup:
        @   q8,q9     round keys 0 and 1 (reloaded by the generic loop)
        @   q10-q15   the six round keys preceding the last, q7 = last round key
        @   r5        round count - 8 (2 selects the all-in-registers loop)
        @   r6        working copy of r5, r7 = pointer used to stream round keys
        @   q6        copy of the initial counter block, used as a template
        @ r4-r10, lr and d8-d15 are saved on entry and restored on exit.
        @
        @ NOTE(review): with r2 == 0 the "blo .Lctr32_tail" below is taken and one
        @ block is still read and written; callers presumably never pass 0 - confirm.
    560 aes_v8_ctr32_encrypt_blocks:
    561 	mov		ip,sp
    562 	stmdb		sp!,{r4-r10,lr}
    563 	vstmdb		sp!,{d8-d15}            @ ABI specification says so
    564 	ldr		r4, [ip]		@ load remaining arg
    565 	ldr		r5,[r3,#240]
    566 
    567 	ldr		r8, [r4, #12]	@ r8 = last word of the counter block, as stored in memory
    568 	vld1.32		{q0},[r4]	@ q0 = whole counter block
    569 
    570 	vld1.32		{q8-q9},[r3]		@ load key schedule...
    571 	sub		r5,r5,#6
    572 	add		r7,r3,r5,lsl#4	@ pointer to last 7 round keys
    573 	sub		r5,r5,#2
    574 	vld1.32		{q10-q11},[r7]!
    575 	vld1.32		{q12-q13},[r7]!
    576 	vld1.32		{q14-q15},[r7]!
    577 	vld1.32		{q7},[r7]
    578 
    579 	add		r7,r3,#32	@ r7 -> round key 2
    580 	mov		r6,r5
    581 
    582 	subs		r2,r2,#2
    583 	blo		.Lctr32_tail	@ fewer than two blocks: single-block path with the counter as loaded
    584 
    585 #ifndef __ARMEB__
    586 	rev		r8, r8
    587 #endif
    588 	vorr		q1,q0,q0
    589 	add		r8, r8, #1
    590 	vorr		q6,q0,q0	@ q6 = template counter block
    591 	rev		r10, r8
    592 	cmp		r5,#2
    593 	vmov.32	d3[1],r10	@ q1 = counter block with counter + 1
    594 	beq		.Lctr32_128
    595 
        @ Generic two-block loop: the early round keys are streamed through q8/q9.
    596 .Loop2x_ctr32:
    597 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    598 	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
    599 	vld1.32		{q8},[r7]!
    600 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    601 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    602 	subs		r6,r6,#2
    603 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    604 	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
    605 	vld1.32		{q9},[r7]!
    606 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    607 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    608 	bgt		.Loop2x_ctr32
    609 
        @ The state moves to q4/q5 here so q0/q1 can be rebuilt from the template
        @ q6 for the next pair while the remaining rounds run.  The input is
        @ pre-xored with the last round key q7, so the closing veor yields
        @ input ^ keystream.  Flags from the "subs r2" below are used by the bhs.
    610 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    611 	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
    612 	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
    613 	 vorr		q0,q6,q6
    614 	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
    615 	 vorr		q1,q6,q6
    616 	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
    617 	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
    618 	 vld1.8		{q2},[r0]!
    619 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    620 	 vld1.8		{q3},[r0]!
    621 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    622 	 add		r8,r8,#1
    623 	.byte	0x24,0x83,0xb0,0xf3	@ aese q4,q10
    624 	.byte	0x24,0xa3,0xb0,0xf3	@ aese q5,q10
    625 	 rev		r9,r8
    626 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    627 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    628 	 add		r8,r8,#1
    629 	.byte	0x26,0x83,0xb0,0xf3	@ aese q4,q11
    630 	.byte	0x26,0xa3,0xb0,0xf3	@ aese q5,q11
    631 	 veor		q2,q2,q7
    632 	 rev		r10,r8
    633 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    634 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    635 	 veor		q3,q3,q7
    636 	 mov		r7,r3
    637 	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
    638 	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
    639 	 subs		r2,r2,#2
    640 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    641 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    642 	 vld1.32	 {q8-q9},[r7]!	@ re-pre-load rndkey[0-1]
    643 	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
    644 	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
    645 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    646 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    647 	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
    648 	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
    649 	 vmov.32	d1[1], r9
    650 	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
    651 	 vmov.32	d3[1], r10
    652 	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
    653 	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
    654 	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
    655 
    656 	 mov		r6,r5
    657 	veor		q2,q2,q4
    658 	veor		q3,q3,q5
    659 	vst1.8		{q2},[r1]!
    660 	vst1.8		{q3},[r1]!
    661 	bhs		.Loop2x_ctr32
    662 
    663 	adds		r2,r2,#2
    664 	beq		.Lctr32_done	@ even block count: finished
    665 	b		.Lctr32_tail	@ one block left; q0 already holds its counter block
    666 
        @ ---- r5 == 2: round keys 2 and 3 live in q4/q5, nothing is reloaded. ----
    667 .Lctr32_128:
    668 	vld1.32		{q4-q5},[r7]
    669 
    670 .Loop2x_ctr32_128:
    671 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    672 	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
    673 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    674 	 vld1.8		{q2},[r0]!
    675 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    676 	 vld1.8		{q3},[r0]!
    677 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    678 	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
    679 	 add		r8,r8,#1
    680 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    681 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    682 	 rev		r9,r8
    683 	.byte	0x08,0x03,0xb0,0xf3	@ aese q0,q4
    684 	.byte	0x08,0x23,0xb0,0xf3	@ aese q1,q4
    685 	 add		r8,r8,#1
    686 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    687 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    688 	 rev		r10,r8
    689 	.byte	0x0a,0x03,0xb0,0xf3	@ aese q0,q5
    690 	.byte	0x0a,0x23,0xb0,0xf3	@ aese q1,q5
    691 	 subs		r2,r2,#2
    692 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    693 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    694 	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
    695 	.byte	0x24,0x23,0xb0,0xf3	@ aese q1,q10
    696 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    697 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    698 	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
    699 	.byte	0x26,0x23,0xb0,0xf3	@ aese q1,q11
    700 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    701 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    702 	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
    703 	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
    704 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    705 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    706 	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
    707 	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
    708 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    709 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    710 	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
    711 	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
    712 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    713 	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
    714 	 veor		q2,q2,q7
    715 	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
    716 	 veor		q3,q3,q7
    717 	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
    718 
        @ Produce the two output blocks and rebuild q0/q1 from the template with
        @ the next two counter values.  The bhs tests the "subs r2" above.
    719 	veor		q2,q2,q0
    720 	vorr		q0,q6,q6
    721 	veor		q3,q3,q1
    722 	vorr		q1,q6,q6
    723 	vst1.8		{q2},[r1]!
    724 	vmov.32	d1[1], r9
    725 	vst1.8		{q3},[r1]!
    726 	vmov.32	d3[1], r10
    727 	bhs		.Loop2x_ctr32_128
    728 
    729 	adds		r2,r2,#2
    730 	beq		.Lctr32_done
    731 
        @ ---- Single-block tail: encrypts the counter block in q0 and xors it
        @ with one input block. ----
    732 .Lctr32_tail:
    733 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    734 	vld1.32		{q8},[r7]!
    735 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    736 	subs		r6,r6,#2
    737 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    738 	vld1.32		{q9},[r7]!
    739 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    740 	bgt		.Lctr32_tail
    741 
    742 	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
    743 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    744 	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
    745 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    746 	 vld1.8		{q2},[r0]
    747 	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
    748 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    749 	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
    750 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    751 	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
    752 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    753 	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
    754 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    755 	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
    756 	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
    757 	 veor		q2,q2,q7
    758 	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
    759 
    760 	veor		q2,q2,q0
    761 	vst1.8		{q2},[r1]
    762 
    763 .Lctr32_done:
    764 	vldmia		sp!,{d8-d15}
    765 	ldmia		sp!,{r4-r10,pc}
    766 .size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
    767 #endif
    768