#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon
#endif
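@ mul_1x1_ialu: multiply two 32-bit GF(2) polynomials on the integer ALU.
@ Inputs:  r0 = b, r1 = a, r12 = 0x1c (3-bit table index mask, pre-scaled
@          by 4), sp -> 32-byte scratch area used as tab[8].
@ Output:  64-bit product a*b in r5 (low word) and r4 (high word).
@ tab[] holds the eight GF(2) multiples of a&0x3fffffff; the two top bits
@ of a are folded in separately (see the conditional eor's below) so that
@ no table entry overflows 32 bits.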
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	r4,#0
	bic	r5,r1,#3<<30		@ a1=a&0x3fffffff
	str	r4,[sp,#0]		@ tab[0]=0
	add	r6,r5,r5		@ a2=a1<<1
	str	r5,[sp,#4]		@ tab[1]=a1
	eor	r7,r5,r6		@ a1^a2
	str	r6,[sp,#8]		@ tab[2]=a2
	mov	r8,r5,lsl#2		@ a4=a1<<2
	str	r7,[sp,#12]		@ tab[3]=a1^a2
	eor	r9,r5,r8		@ a1^a4
	str	r8,[sp,#16]		@ tab[4]=a4
	eor	r4,r6,r8		@ a2^a4
	str	r9,[sp,#20]		@ tab[5]=a1^a4
	eor	r7,r7,r8		@ a1^a2^a4
	str	r4,[sp,#24]		@ tab[6]=a2^a4
	and	r8,r12,r0,lsl#2
	str	r7,[sp,#28]		@ tab[7]=a1^a2^a4

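@ Scan b three bits at a time: each tab[] lookup is xor-ed into the
@ accumulator at its bit position, r5 collecting the low word and r4
@ collecting the bits that shift out into the high word.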
	and	r9,r12,r0,lsr#1
	ldr	r5,[sp,r8]		@ tab[b       & 0x7]
	and	r8,r12,r0,lsr#4
	ldr	r7,[sp,r9]		@ tab[b >>  3 & 0x7]
	and	r9,r12,r0,lsr#7
	ldr	r6,[sp,r8]		@ tab[b >>  6 & 0x7]
	eor	r5,r5,r7,lsl#3	@ stall
	mov	r4,r7,lsr#29
	ldr	r7,[sp,r9]		@ tab[b >>  9 & 0x7]

	and	r8,r12,r0,lsr#10
	eor	r5,r5,r6,lsl#6
	eor	r4,r4,r6,lsr#26
	ldr	r6,[sp,r8]		@ tab[b >> 12 & 0x7]

	and	r9,r12,r0,lsr#13
	eor	r5,r5,r7,lsl#9
	eor	r4,r4,r7,lsr#23
	ldr	r7,[sp,r9]		@ tab[b >> 15 & 0x7]

	and	r8,r12,r0,lsr#16
	eor	r5,r5,r6,lsl#12
	eor	r4,r4,r6,lsr#20
	ldr	r6,[sp,r8]		@ tab[b >> 18 & 0x7]

	and	r9,r12,r0,lsr#19
	eor	r5,r5,r7,lsl#15
	eor	r4,r4,r7,lsr#17
	ldr	r7,[sp,r9]		@ tab[b >> 21 & 0x7]

	and	r8,r12,r0,lsr#22
	eor	r5,r5,r6,lsl#18
	eor	r4,r4,r6,lsr#14
	ldr	r6,[sp,r8]		@ tab[b >> 24 & 0x7]

	and	r9,r12,r0,lsr#25
	eor	r5,r5,r7,lsl#21
	eor	r4,r4,r7,lsr#11
	ldr	r7,[sp,r9]		@ tab[b >> 27 & 0x7]

	tst	r1,#1<<30
	and	r8,r12,r0,lsr#28
	eor	r5,r5,r6,lsl#24
	eor	r4,r4,r6,lsr#8
	ldr	r6,[sp,r8]		@ tab[b >> 30      ]

	eorne	r5,r5,r0,lsl#30
	eorne	r4,r4,r0,lsr#2
	tst	r1,#1<<31
	eor	r5,r5,r7,lsl#27
	eor	r4,r4,r7,lsr#5
	eorne	r5,r5,r0,lsl#31
	eorne	r4,r4,r0,lsr#1
	eor	r5,r5,r6,lsl#30
	eor	r4,r4,r6,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
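@ bn_GF2m_mul_2x2(r, a1, a0, b1, b0): multiply the 64-bit GF(2) polynomials
@ a1<<32|a0 and b1<<32|b0 and store the 128-bit product in r[0..3], least
@ significant word first.  Arguments arrive in r0-r3 plus one word (b0)
@ on the stack.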
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
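@ Runtime dispatch: load OPENSSL_armcap_P PC-relatively and fall back to
@ the integer-only path unless the NEON capability bit (bit 0) is set.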
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

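@ NEON path: a 64x64-bit carry-less multiplication built from vmull.p8,
@ which produces eight independent 8x8->16-bit polynomial products per
@ instruction.  Products of byte-rotated copies of A and B (A1..A3,
@ B1..B4) are masked, realigned with vext and xor-ed into the per-byte
@ product D = A*B to form the full 128-bit result.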
	ldr		r12, [sp]		@ 5th argument
	vmov.32		d26, r2, r1
	vmov.32		d27, r12, r3
	vmov.i64	d28, #0x0000ffffffffffff
	vmov.i64	d29, #0x00000000ffffffff
	vmov.i64	d30, #0x000000000000ffff

	vext.8		d2, d26, d26, #1	@ A1
	vmull.p8	q1, d2, d27		@ F = A1*B
	vext.8		d0, d27, d27, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8		d4, d26, d26, #2	@ A2
	vmull.p8	q2, d4, d27		@ H = A2*B
	vext.8		d16, d27, d27, #2	@ B2
	vmull.p8	q8, d26, d16		@ G = A*B2
	vext.8		d6, d26, d26, #3	@ A3
	veor		q1, q1, q0		@ L = E + F
	vmull.p8	q3, d6, d27		@ J = A3*B
	vext.8		d0, d27, d27, #3	@ B3
	veor		q2, q2, q8		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor		d2, d2, d3	@ t0 = (L) (P0 + P1) << 8
	vand		d3, d3, d28
	vext.8		d16, d27, d27, #4	@ B4
	veor		d4, d4, d5	@ t1 = (M) (P2 + P3) << 16
	vand		d5, d5, d29
	vmull.p8	q8, d26, d16		@ K = A*B4
	veor		q3, q3, q0		@ N = I + J
	veor		d2, d2, d3
	veor		d4, d4, d5
	veor		d6, d6, d7	@ t2 = (N) (P4 + P5) << 24
	vand		d7, d7, d30
	vext.8		q1, q1, q1, #15
	veor		d16, d16, d17	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d17, #0
	vext.8		q2, q2, q2, #14
	veor		d6, d6, d7
	vmull.p8	q0, d26, d27		@ D = A*B
	vext.8		q8, q8, q8, #12
	vext.8		q3, q3, q3, #13
	veor		q1, q1, q2
	veor		q3, q3, q8
	veor		q0, q0, q1
	veor		q0, q0, q3

	vst1.32		{q0}, [r0]
	bx	lr		@ bx lr
.align	4
.Lialu:
#endif
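@ Integer-only path: Karatsuba on the 32-bit halves.  With c = a0*b0,
@ d = a1*b1 and e = (a1^a0)*(b1^b0), the product is
@ d<<64 ^ (c^d^e)<<32 ^ c; the three mul_1x1_ialu calls below compute
@ d, c and e in that order.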
	stmdb	sp!,{r4-r10,lr}
	mov	r10,r0			@ reassign 1st argument
	mov	r0,r3			@ r0=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	r12,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1b1
	str	r5,[r10,#8]
	str	r4,[r10,#12]

	eor	r0,r0,r3		@ flip b0 and b1
	 eor	r1,r1,r2		@ flip a0 and a1
	eor	r3,r3,r0
	 eor	r2,r2,r1
	eor	r0,r0,r3
	 eor	r1,r1,r2
	bl	mul_1x1_ialu		@ a0b0
	str	r5,[r10]
	str	r4,[r10,#4]

	eor	r1,r1,r2
	eor	r0,r0,r3
	bl	mul_1x1_ialu		@ (a1+a0)(b1+b0)
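@ Recombine: the ldmia below reloads c = a0b0 into r6:r7 and d = a1b1
@ into r8:r9 (low:high), while r5:r4 holds e; the eor chain then folds
@ c^d^e into result words 1 and 2 in place.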
	ldmia	r10,{r6-r9}
	eor	r5,r5,r4
	eor	r4,r4,r7
	eor	r5,r5,r6
	eor	r4,r4,r8
	eor	r5,r5,r9
	eor	r4,r4,r9
	str	r4,[r10,#8]
	eor	r5,r5,r4
	add	sp,sp,#32		@ destroy tab[8]
	str	r5,[r10,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
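@ PC-relative offset to OPENSSL_armcap_P, resolved at .Lpic above
@ (the +8 accounts for the ARM-mode pipeline PC bias).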
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4