#include "arm_arch.h"

.text
.code	32

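@ rem_4bit is the reduction table for the 4-bit (Shoup's) method: entry n
@ holds the GF(2) (carry-less) product n·0xE1 shifted left by 5, i.e. the
@ term folded back into the top of Z for the four bits shifted out when
@ reducing modulo the GHASH polynomial (0xE1 is the reflected form of
@ x^128+x^7+x^2+x+1). E.g. entry 3: 0xE1^0x1C2 = 0x123, and 0x123<<5 = 0x2460.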
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

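@ rem_4bit_get returns &rem_4bit in r2, position-independently: in ARM
@ state pc reads as the current instruction's address plus 8, so
@ "sub r2,pc,#8" yields the address of that instruction, and backing up
@ another 32 bytes (the size of the table) lands on rem_4bit.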
.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

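@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);
@ as declared in OpenSSL's gcm128.c: r0=Xi, r1=Htable (256 bytes of
@ pre-computed nibble multiples of H), r2=inp, r3=len (a multiple of 16).
@ For each 16-byte block it computes Xi = (Xi^block)·H. The 32-byte
@ rem_4bit table is copied onto the stack first so the inner loop can
@ reach it with a cheap ldrh [sp,rX].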
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

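@ .Linner processes one byte of Xi^inp per iteration, from offset 14 down
@ to 0 (r3 counts down; the "pl"-conditional loads keep fetching the next
@ byte until r3 goes negative). Per nibble it effectively computes, with
@ Z held in r4 (low word) through r7 (high word):
@	Z = (Z >> 4) ^ Htable[nibble]
@	Z.hi ^= (u64)rem_4bit[bits shifted out] << 48
@ matching the C reference loop in OpenSSL's gcm128.c.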
.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

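@ Block done: write Z back to Xi. GHASH keeps Xi big-endian, so the
@ little-endian ARMv7 path byte-swaps each word with rev before str, the
@ big-endian path stores directly, and the generic path emits four strb
@ per word. The strb fallback is effectively (hypothetical helper name,
@ for illustration only):
@	void put_be32(unsigned char *p, unsigned int v)
@	{ p[0]=v>>24; p[1]=v>>16; p[2]=v>>8; p[3]=v; }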
	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
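@ Return: on ARMv5+ popping pc directly interworks. On older cores, pop
@ lr and test bit 0: an ARM-state caller (bit 0 clear) gets a plain
@ "mov pc,lr", a Thumb caller gets "bx lr", spelled as .word 0xe12fff1e
@ so that an ARMv4 assembler still accepts the file.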
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

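@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
@ In-place multiply Xi = Xi·H. Same 4-bit loop as gcm_ghash_4bit, except
@ that rem_4bit is addressed through r2 (set up by the hop through
@ rem_4bit_get) rather than a stack copy, and there is no input block to
@ fold in.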
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7
.fpu	neon

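@ void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
@ NEON variant of the in-place multiply. It uses H itself rather than
@ Htable: H sits 16 bytes below Htable in GCM128_CTX, hence the
@ "sub r1,#16". The product is accumulated one byte of Xi at a time with
@ vmull.p8 (8x8 polynomial multiply); it reuses gcm_ghash_neon's
@ .Linner_neon loop, with r3=16 so the outer loop runs exactly once.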
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		r1,#16		@ point at H in GCM128_CTX
	vld1.64		d29,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1	@ our irreducible polynomial
	vld1.64		d28,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r1,{d0-d1}	@ load H
	veor		q12,q12
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		q13,q13
	veor		q11,q11
	mov		r1,#16
	veor		q10,q10
	mov		r3,#16
	veor		d2,d2
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

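@ void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);
@ Note r1 (Htable) is recycled as the round counter; H is instead loaded
@ through r0, which the two post-incremented Xi loads have left pointing
@ at H (H follows Xi in GCM128_CTX). Each .Louter_neon pass folds one
@ 16-byte block: inp ^= Xi, then 16 bytewise multiply rounds.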
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d21,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1	@ our irreducible polynomial
	vld1.64		d20,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r0,{d0-d1}	@ load H
	veor		q12,q12
	nop
#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
.Louter_neon:
	vld1.64		d29,[r2]!	@ load inp
	veor		q13,q13
	vld1.64		d28,[r2]!
	veor		q11,q11
	mov		r1,#16
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		d2,d2
	veor		q14,q10		@ inp^=Xi
	veor		q10,q10
	vdup.8		d4,d28[0]	@ broadcast lowest byte
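@ One .Linner_neon round: multiply both halves of H by the current lowest
@ byte of IN (vmull.p8), shift IN and Z down a byte (the vext.8 against
@ q12, which is kept zero), accumulate the unzipped products into Z, and
@ fold the byte falling off Z back in via the "carry"·0xe1 multiply. The
@ XORs are modulo-scheduled across iterations, so part of round i only
@ completes in round i+1.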
.Linner_neon:
	subs		r1,r1,#1
	vmull.p8	q9,d1,d4	@ H.lo·Xi[i]
	vmull.p8	q8,d0,d4	@ H.hi·Xi[i]
	vext.8		q14,q12,#1	@ IN>>=8

	veor		q10,q13		@ modulo-scheduled part
	vshl.i64	d22,#48
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	veor		d3,d18,d20

	veor		d21,d22
	vuzp.8		q9,q8
	vsli.8		d2,d3,#1	@ compose the "carry" byte
	vext.8		q10,q12,#1	@ Z>>=8

	vmull.p8	q11,d2,d5	@ "carry"·0xe1
	vshr.u8		d2,d3,#7	@ save Z's bottom bit
	vext.8		q13,q9,q12,#1	@ Qlo>>=8
	veor		q10,q8
	bne		.Linner_neon

	veor		q10,q13		@ modulo-scheduled artefact
	vshl.i64	d22,#48
	veor		d21,d22

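@ The loop leaves the result one bit to the right: each round saved the
@ bit shifted out of Z's bottom in d2 (Zo). Mask it with the polynomial
@ byte, then shift the 128-bit Z left by one (cross-word carry via d3)
@ and OR Zo back in, per the "Z=Z:Zo<<1" comment below.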
	@ finalization, normalize Z:Zo
	vand		d2,d5		@ suffices to mask the bit
	vshr.u64	d3,d20,#63
	vshl.i64	q10,#1
	subs		r3,#16
	vorr		q10,q1		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
	sub		r0,#16
	vst1.64		d21,[r0,:64]!	@ write out Xi
	vst1.64		d20,[r0,:64]

	.word	0xe12fff1e	@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2
    409