#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. For the time being it's a fairly mechanical,
# low-hanging-fruit port from C... Except that it has two code paths:
# pure integer code suitable for any ARMv4 and later CPU, and NEON
# code suitable for ARMv7. The pure integer 1x1 multiplication
# subroutine runs in ~45 cycles on a dual-issue core such as
# Cortex-A8, which is ~50% faster than compiler-generated code. For
# ECDH and ECDSA verify (but not for ECDSA sign) it means a 25%-45%
# improvement depending on key length, more for longer keys. Even
# though NEON 1x1 multiplication runs in even fewer cycles, ~30, the
# improvement is measurable only on longer keys. One has to optimize
# code elsewhere to get the NEON glow...
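#
# For reference, a minimal bit-at-a-time C sketch of the 32x32->64
# carry-less ("polynomial") multiplication that both code paths below
# compute; the function name is illustrative, not from bn_gf2m.c:
#
#	#include <stdint.h>
#
#	static void gf2m_mul_1x1_ref(uint32_t *hi, uint32_t *lo,
#	                             uint32_t a, uint32_t b)
#	{
#		uint64_t r = 0, aa = a;
#
#		while (b) {
#			if (b & 1)
#				r ^= aa;	/* GF(2) addition is XOR */
#			aa <<= 1;		/* next power of x */
#			b >>= 1;
#		}
#		*hi = (uint32_t)(r >> 32);
#		*lo = (uint32_t)r;
#	}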

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

# Dlo()/Dhi() return the d-register alias of the low/high 64-bit half
# of a NEON q-register argument; Q() maps an even d-register back to
# the q-register that contains it.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
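	@ 32x32->64 carry-less multiplication built from vmull.p8,
	@ which multiplies eight byte pairs over GF(2). On entry d16
	@ holds a in its low 32 bits and d17 holds two copies of b;
	@ q1-q3 receive byte-shifted copies of a so that every needed
	@ byte product appears, and the shift/xor ladder below
	@ realigns the partial products and folds them into d0.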
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of a
	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
	vshl.u64	`&Dlo("q2")`,d16,#16
	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
	vshl.u64	`&Dlo("q3")`,d16,#24
	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
	vshr.u64	`&Dlo("q1")`,#8
	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
	vshl.u64	`&Dhi("q1")`,#24
	veor		d0,`&Dlo("q1")`
	vshr.u64	`&Dlo("q2")`,#16
	veor		d0,`&Dhi("q1")`
	vshl.u64	`&Dhi("q2")`,#16
	veor		d0,`&Dlo("q2")`
	vshr.u64	`&Dlo("q3")`,#24
	veor		d0,`&Dhi("q2")`
	vshl.u64	`&Dhi("q3")`,#8
	veor		d0,`&Dlo("q3")`
	veor		d0,`&Dhi("q3")`
	bx	lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";

($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";	# loaded with #7<<2 at run-time: the 3-bit tab[]
		# index is pre-scaled to a byte offset
$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
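	@ low-level 32x32->64 multiplication over GF(2), processing b
	@ in 3-bit windows: tab[i] on the stack holds the carry-less
	@ product i*a1 for i=0..7, where a1 is a with its two top bits
	@ cleared; those two bits are compensated for at the end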
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
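################
# For reference, a C sketch of the windowed multiplication performed
# by mul_1x1_ialu above, in the spirit of the bn_gf2m.c code this
# module ports; names are illustrative:
#
#	static void gf2m_mul_1x1_win(uint32_t *hi, uint32_t *lo,
#	                             uint32_t a, uint32_t b)
#	{
#		uint32_t top2b = a >> 30;
#		uint32_t a1 = a & 0x3fffffff, a2 = a1 << 1, a4 = a1 << 2;
#		uint32_t tab[8] = { 0,  a1,      a2,      a1 ^ a2,
#		                    a4, a1 ^ a4, a2 ^ a4, a1 ^ a2 ^ a4 };
#		uint64_t r = tab[b & 7];
#		int k;
#
#		for (k = 3; k < 32; k += 3)	/* remaining 3-bit windows */
#			r ^= (uint64_t)tab[(b >> k) & 7] << k;
#
#		/* compensate for the two top bits of a */
#		if (top2b & 1) r ^= (uint64_t)b << 30;
#		if (top2b & 2) r ^= (uint64_t)b << 31;
#
#		*hi = (uint32_t)(r >> 32);
#		*lo = (uint32_t)r;
#	}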
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0

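# The routine below computes the 64x64-bit product with one level of
# Karatsuba: three 1x1 multiplications instead of four, with both "+"
# and "-" being XOR over GF(2). A C sketch with illustrative names,
# where clmul32() stands for the 32x32->64 carry-less multiply
# implemented by mul_1x1_neon/mul_1x1_ialu:
#
#	static void gf2m_mul_2x2_ref(uint32_t r[4], uint32_t a1,
#	                             uint32_t a0, uint32_t b1, uint32_t b0)
#	{
#		uint64_t hh = clmul32(a1, b1);
#		uint64_t ll = clmul32(a0, b0);
#		uint64_t mm = clmul32(a1 ^ a0, b1 ^ b0);
#
#		mm ^= hh ^ ll;			/* middle term */
#		ll ^= mm << 32;
#		hh ^= mm >> 32;
#
#		r[0] = (uint32_t)ll; r[1] = (uint32_t)(ll >> 32);
#		r[2] = (uint32_t)hh; r[3] = (uint32_t)(hh >> 32);
#	}
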
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));

$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
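	@ run-time dispatch: .LOPENSSL_armcap below holds the
	@ PC-relative offset of OPENSSL_armcap_P; bit 0 of that word
	@ (ARMV7_NEON) selects the NEON path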
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
	vmov.32	$B1,r3,r3		@ two copies of b1
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
	vmov.32	${A0}[0],r2		@ a0
	mov	r12,lr

	vmov	d16,$A1
	vmov	d17,$B1
	bl	mul_1x1_neon		@ a1·b1
	vmov	$A1B1,d0

	vmov	d16,$A0
	vmov	d17,$B0
	bl	mul_1x1_neon		@ a0·b0
	vmov	$A0B0,d0

	veor	d16,$A0,$A1
	veor	d17,$B0,$B1
	veor	$A0,$A0B0,$A1B1
	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)

	veor	d0,$A0			@ (a0+a1)(b0+b1)-a0b0-a1b1
	vshl.u64 d1,d0,#32
	vshr.u64 d0,d0,#32
	veor	$A0B0,d1
	veor	$A1B1,d0
	vst1.32	{${A0B0}[0]},[r0,:32]!
	vst1.32	{${A0B0}[1]},[r0,:32]!
	vst1.32	{${A1B1}[0]},[r0,:32]!
	vst1.32	{${A1B1}[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
___
$ret="r10";	# reassigned 1st argument
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

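	@ swap b1<->b0 and a1<->a0 with the three-XOR register swap,
	@ the two swaps interleaved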
	eor	$b,$b,r3		@ flip b0 and b1
	 eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	 eor	r2,r2,$a
	eor	$b,$b,r3
	 eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
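	@ Karatsuba fix-up: fold the middle term
	@ (a1+a0)(b1+b0)^a1b1^a0b0 into r[2]:r[1], i.e. at a 32-bit
	@ offset into the 128-bit result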
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4 (0xe12fff1e is the encoding of "bx lr")
print $code;
close STDOUT;   # enforce flush
    279