#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

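@ mul_1x1_neon: 32x32->64-bit carry-less (GF(2)[x]) multiplication on NEON.
@ As can be read from the callers below, a is expected in d16, b (replicated
@ into both 32-bit halves) in d17, and the 64-bit product is returned in d0.
@ vmull.p8 multiplies corresponding 8-bit lanes as polynomials over GF(2),
@ so the routine first forms byte-shifted copies of a (a<<8, a<<16, a<<24),
@ multiplies each against the replicated b, and then realigns and folds the
@ four partial products into d0 with the shift/XOR tail.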
.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	d2,d16,#8	@ q1-q3 hold byte-shifted copies of a
	vmull.p8	q0,d16,d17	@ a·bb
	vshl.u64	d4,d16,#16
	vmull.p8	q1,d2,d17	@ a<<8·bb
	vshl.u64	d6,d16,#24
	vmull.p8	q2,d4,d17	@ a<<16·bb
	vshr.u64	d2,#8
	vmull.p8	q3,d6,d17	@ a<<24·bb
	vshl.u64	d3,#24
	veor		d0,d2
	vshr.u64	d4,#16
	veor		d0,d3
	vshl.u64	d5,#16
	veor		d0,d4
	vshr.u64	d6,#24
	veor		d0,d5
	vshl.u64	d7,#8
	veor		d0,d6
	veor		d0,d7
	.word	0xe12fff1e		@ bx lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
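@ mul_1x1_ialu: the same 32x32->64-bit GF(2)[x] product on the integer ALU.
@ Calling convention, as used by bn_GF2m_mul_2x2 below: a in r1, b in r0,
@ r12 preloaded with the byte-offset mask 7<<2, a 32-byte tab[8] at sp;
@ the result is returned in r5 (low word) and r4 (high word), r6-r9 are
@ clobbered as scratch.  tab[i] holds the carry-less product of the 3-bit
@ value i with a1 = a & 0x3fffffff; the top two bits of a are masked off so
@ that a1<<1 and a1<<2 still fit in 32 bits, and are re-applied at the end
@ by the conditional XORs guarded by the two tst instructions.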
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	r4,#0
	bic	r5,r1,#3<<30		@ a1=a&0x3fffffff
	str	r4,[sp,#0]		@ tab[0]=0
	add	r6,r5,r5		@ a2=a1<<1
	str	r5,[sp,#4]		@ tab[1]=a1
	eor	r7,r5,r6		@ a1^a2
	str	r6,[sp,#8]		@ tab[2]=a2
	mov	r8,r5,lsl#2		@ a4=a1<<2
	str	r7,[sp,#12]		@ tab[3]=a1^a2
	eor	r9,r5,r8		@ a1^a4
	str	r8,[sp,#16]		@ tab[4]=a4
	eor	r4,r6,r8		@ a2^a4
	str	r9,[sp,#20]		@ tab[5]=a1^a4
	eor	r7,r7,r8		@ a1^a2^a4
	str	r4,[sp,#24]		@ tab[6]=a2^a4
	and	r8,r12,r0,lsl#2		@ (b&7)<<2: byte offset of tab[b&7]
	str	r7,[sp,#28]		@ tab[7]=a1^a2^a4

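	@ Scan b (r0) three bits at a time: each window, masked with r12 into a
	@ byte offset, selects a tab[] entry that is XORed into the 64-bit
	@ accumulator r5:r4 at the window's bit position; the loads are
	@ interleaved with the XORs to hide load-use latency.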
	and	r9,r12,r0,lsr#1
	ldr	r5,[sp,r8]		@ tab[b       & 0x7]
	and	r8,r12,r0,lsr#4
	ldr	r7,[sp,r9]		@ tab[b >>  3 & 0x7]
	and	r9,r12,r0,lsr#7
	ldr	r6,[sp,r8]		@ tab[b >>  6 & 0x7]
	eor	r5,r5,r7,lsl#3	@ stall
	mov	r4,r7,lsr#29
	ldr	r7,[sp,r9]		@ tab[b >>  9 & 0x7]

	and	r8,r12,r0,lsr#10
	eor	r5,r5,r6,lsl#6
	eor	r4,r4,r6,lsr#26
	ldr	r6,[sp,r8]		@ tab[b >> 12 & 0x7]

	and	r9,r12,r0,lsr#13
	eor	r5,r5,r7,lsl#9
	eor	r4,r4,r7,lsr#23
	ldr	r7,[sp,r9]		@ tab[b >> 15 & 0x7]

	and	r8,r12,r0,lsr#16
	eor	r5,r5,r6,lsl#12
	eor	r4,r4,r6,lsr#20
	ldr	r6,[sp,r8]		@ tab[b >> 18 & 0x7]

	and	r9,r12,r0,lsr#19
	eor	r5,r5,r7,lsl#15
	eor	r4,r4,r7,lsr#17
	ldr	r7,[sp,r9]		@ tab[b >> 21 & 0x7]

	and	r8,r12,r0,lsr#22
	eor	r5,r5,r6,lsl#18
	eor	r4,r4,r6,lsr#14
	ldr	r6,[sp,r8]		@ tab[b >> 24 & 0x7]

	and	r9,r12,r0,lsr#25
	eor	r5,r5,r7,lsl#21
	eor	r4,r4,r7,lsr#11
	ldr	r7,[sp,r9]		@ tab[b >> 27 & 0x7]

	tst	r1,#1<<30
	and	r8,r12,r0,lsr#28
	eor	r5,r5,r6,lsl#24
	eor	r4,r4,r6,lsr#8
	ldr	r6,[sp,r8]		@ tab[b >> 30      ]

	eorne	r5,r5,r0,lsl#30
	eorne	r4,r4,r0,lsr#2
	tst	r1,#1<<31
	eor	r5,r5,r7,lsl#27
	eor	r4,r4,r7,lsr#5
	eorne	r5,r5,r0,lsl#31
	eorne	r4,r4,r0,lsr#1
	eor	r5,r5,r6,lsl#30
	eor	r4,r4,r6,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
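@ bn_GF2m_mul_2x2(r, a1, a0, b1, b0) -- argument order as implied by the
@ register usage below: r in r0, a1 in r1, a0 in r2, b1 in r3, b0 on the
@ stack -- computes the 128-bit product of the 64-bit polynomials
@ a1*x^32+a0 and b1*x^32+b0 over GF(2) and stores it in r[0..3], least
@ significant word first.  Both paths use one level of Karatsuba, i.e.
@ three 1x1 multiplications:
@
@   (a1*x^32+a0)*(b1*x^32+b0) = a1·b1*x^64 + a0·b0
@                             + ((a1+a0)·(b1+b0) + a1·b1 + a0·b0)*x^32
@
@ where "+" is XOR.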
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
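	@ Runtime dispatch: load OPENSSL_armcap_P PC-relative and test bit 0,
	@ the NEON capability flag from arm_arch.h; fall through to the
	@ pure-ALU path at .Lialu when NEON is not available.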
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	d18,d18
	vmov.32	d19,r3,r3		@ two copies of b1
	vmov.32	d18[0],r1		@ a1

	veor	d20,d20
	vld1.32	d21[],[sp,:32]	@ two copies of b0
	vmov.32	d20[0],r2		@ a0
	mov	r12,lr

	vmov	d16,d18
	vmov	d17,d19
	bl	mul_1x1_neon		@ a1·b1
	vmov	d22,d0

	vmov	d16,d20
	vmov	d17,d21
	bl	mul_1x1_neon		@ a0·b0
	vmov	d23,d0

	veor	d16,d20,d18
	veor	d17,d21,d19
	veor	d20,d23,d22
	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)

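	@ Karatsuba recombination: d20 was set to a0·b0^a1·b1 above, so d0^d20
	@ is the middle term; split it at bit 32 and XOR it into the low (d23)
	@ and high (d22) halves of the 128-bit result before storing.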
	veor	d0,d20			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
	vshl.u64 d1,d0,#32
	vshr.u64 d0,d0,#32
	veor	d23,d1
	veor	d22,d0
	vst1.32	{d23[0]},[r0,:32]!
	vst1.32	{d23[1]},[r0,:32]!
	vst1.32	{d22[0]},[r0,:32]!
	vst1.32	{d22[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
	stmdb	sp!,{r4-r10,lr}
	mov	r10,r0			@ reassign 1st argument
	mov	r0,r3			@ r0=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	r12,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	r5,[r10,#8]
	str	r4,[r10,#12]

	eor	r0,r0,r3		@ flip b0 and b1
	 eor	r1,r1,r2		@ flip a0 and a1
	eor	r3,r3,r0
	 eor	r2,r2,r1
	eor	r0,r0,r3
	 eor	r1,r1,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	r5,[r10]
	str	r4,[r10,#4]

	eor	r1,r1,r2
	eor	r0,r0,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
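	@ Karatsuba recombination: reload a0·b0 (r6:r7) and a1·b1 (r8:r9) from
	@ r[], combine them with (a1+a0)·(b1+b0) in r5:r4 to obtain the middle
	@ term, and fold it into words 1 and 2 of the result.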
	ldmia	r10,{r6-r9}
	eor	r5,r5,r4
	eor	r4,r4,r7
	eor	r5,r5,r6
	eor	r4,r4,r8
	eor	r5,r5,r9
	eor	r4,r4,r9
	str	r4,[r10,#8]
	eor	r5,r5,r4
	add	sp,sp,#32		@ destroy tab[8]
	str	r5,[r10,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4