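@ GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org> (see the .asciz
@ string at the end of this file). The plain-ARM entry points implement the
@ 4-bit table-driven GHASH (Htable plus the rem_4bit reduction table); the
@ ARMv7 section at the bottom is a NEON path built on vmull.p8 polynomial
@ multiplies with Karatsuba splitting.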
#include "arm_arch.h"

.text
.code	32

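@ rem_4bit[] holds the sixteen pre-shifted reduction constants of the 4-bit
@ algorithm: when Xi is shifted right by 4 bits, the entry indexed by the
@ nibble that falls off is XOR-ed (shifted left by 16 below) into the most
@ significant word, folding the result back modulo the GHASH polynomial
@ x^128 + x^7 + x^2 + x + 1.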
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

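@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ presumably follows the usual OpenSSL convention: r0=Xi, r1=Htable,
@ r2=inp, r3=len (a multiple of 16). rem_4bit is copied to the stack so
@ the inner loop can index it as [sp,rem].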
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

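@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]), presumably with
@ r0=Xi and r1=Htable, multiplies Xi by H in place using the same 4-bit
@ algorithm; here rem_4bit is addressed through r2, which rem_4bit_get
@ loads pc-relative, instead of a stack copy.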
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7
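@ NEON code path (only assembled for __ARM_ARCH__>=7). gcm_init_neon
@ appears to take r0=Htable and r1=H: it loads H, shifts it left by one
@ bit, conditionally folds in the 0xc2...01 constant when the bit shifted
@ out of the top is set, and stores the resulting "twisted H" that the
@ multiply code below expects.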
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		d7,[r1,:64]!	@ load H
	vmov.i8		q8,#0xe1
	vld1.64		d6,[r1,:64]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8		q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8		q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand		q8,q8,q9
	vorr		d7,d26		@ H<<<=1
	veor		q3,q3,q8		@ twisted H
	vstmia		r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

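@ gcm_gmult_neon, presumably with r0=Xi and r1=Htable (the twisted H from
@ gcm_init_neon), loads Xi and H, sets r3=16 and branches into the shared
@ multiply-and-reduce code at .Lgmult_neon, so the loop there runs exactly
@ once before Xi is written back.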
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		d7,[r0,:64]!	@ load Xi
	vld1.64		d6,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing
	mov		r3,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

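@ gcm_ghash_neon, presumably with r0=Xi, r1=Htable, r2=inp and r3=len (a
@ multiple of 16), XORs each 16-byte block of inp into Xi, multiplies by
@ the twisted H via .Lgmult_neon, and loops until r3 reaches zero before
@ writing Xi back.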
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d1,[r0,:64]!	@ load Xi
	vld1.64		d0,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		d7,[r2]!		@ load inp
	vld1.64		d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor		q3,q0			@ inp^=Xi
.Lgmult_neon:
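	@ Each of the three blocks below synthesises one 64x64-bit carry-less
	@ multiplication out of 8x8-bit vmull.p8 partial products (the D..N
	@ terms in the comments), aligned with vext and masked/combined with
	@ vand/veor. Together with the d28=d26^d27 factor prepared above and
	@ the "Karatsuba pre-processing" of the other operand, they form the
	@ three half-width products that the Karatsuba post-processing folds
	@ into the 256-bit product Xh|Xl.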
	vext.8		d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8		d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8		d20, d26, d26, #3	@ A3
	veor		q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d0, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor		q10, q10, q0		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q0, q0, q8
	veor		q0, q0, q10
	veor		d6,d6,d7	@ Karatsuba pre-processing
	vext.8		d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8		d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8		d20, d28, d28, #3	@ A3
	veor		q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d2, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor		q10, q10, q1		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q1, q1, q8
	veor		q1, q1, q10
	vext.8		d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8		d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8		d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8		d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8		d20, d27, d27, #3	@ A3
	veor		q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8		d4, d7, d7, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d7, d7, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor		q10, q10, q2		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q2, q2, q8
	veor		q2, q2, q10
	veor		q1,q1,q0		@ Karatsuba post-processing
	veor		q1,q1,q2
	veor		d1,d1,d2
	veor		d4,d4,d3	@ Xh|Xl - 256-bit result

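	@ Reduce the 256-bit product Xh|Xl modulo the GHASH polynomial
	@ x^128 + x^7 + x^2 + x + 1: the first phase shifts left by 63/62/57
	@ (i.e. 64 minus 1/2/7) and the second phase composes right shifts of
	@ 1, 2 and 7, matching the x + x^2 + x^7 terms of the polynomial.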
	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor		q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor		q10, q10, q9		@
	veor		d1,d1,d20	@
	veor		d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor		q2,q2,q0
	veor		q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor		q0,q0,q2		@
	veor		q0,q0,q10		@

	subs		r3,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub		r0,#16
	vst1.64		d1,[r0,:64]!	@ write out Xi
	vst1.64		d0,[r0,:64]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2