#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have the
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add a NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, i.e. 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors because the gain on
# others outweighs the marginal loss on Cortex-A9.

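# What follows is a minimal reference model, not used by the generator, of
# the contract this module is written against: bn_mul_mont(rp,ap,bp,np,&n0,
# num) stores ap*bp*R^-1 mod np into rp[], where R=2^(32*num) and n0 holds
# the least significant word of -np^-1 mod R. The sketch assumes Math::BigInt
# operands and exists purely to document the arithmetic.

use Math::BigInt;

sub mont_mul_ref {				# illustrative only, never called
	my ($a,$b,$n,$num) = @_;		# Math::BigInt values, $num 32-bit words
	my $R  = Math::BigInt->new(1)->blsft(32*$num);
	my $n0 = $n->copy()->bmodinv($R)->bneg()->bmod($R);	# -n^-1 mod R
	my $t  = $a->copy()->bmul($b);
	my $m  = $t->copy()->bmul($n0)->bmod($R);		# reduction multiplier
	$t->badd($m->bmul($n))->brsft(32*$num);	# (t+m*n)/R, division is exact
	$t->bsub($n) if ($t->bcmp($n) >= 0);	# single conditional subtraction
	return $t;				# == a*b*R^-1 mod n
}
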
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
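
# A rough picture of the stack frame the prologue below sets up; offsets are
# given relative to &tp[num-1], i.e. to the value kept in $num (tp[0] is at sp):
#
#	+0	tp[num-1], +4 tp[num]
#	+8	saved r4-r12,lr (10 words)
#	+48	saved rp (r0), +52 saved bp (r2)
#	+56	caller's &n0, +60 caller's num (the original stack arguments)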

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_ARCH__>=7
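	@ take the NEON path only when num is a multiple of 8 and
	@ OPENSSL_armcap_P advertises NEON; otherwise fall through to .Lialu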
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

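@ first pass: tp[] = ap[]*bp[0] + np[]*m0, where m0 = n0*ap[0]*bp[0] mod 2^32
@ clears the lowest word of the sum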
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

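@ outer loop (bp[i], i>0): fold ap[]*bp[i] into tp[], add the multiple of
@ np[] that clears the lowest word, and shift tp[] down by one word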
.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

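	@ final reduction: compute tp-np with borrow, then select between tp
	@ and the difference without a data-dependent branch and copy to rp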
	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
___
{
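# Dlo()/Dhi() map a NEON quad register name q<n> to its low and high
# double-register halves, d<2*n> and d<2*n+1>.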
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

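# A note on the NEON representation (a sketch of the idea, not a spec):
# bp[i] and the per-iteration Montgomery factor are split into 16-bit halves
# (vzip.16 against zero), so each vmull/vmlal.u32 below accumulates 16x32-bit
# partial products into 64-bit lanes. The headroom this leaves allows carry
# propagation to be deferred to .LNEON_tail, where the column sums are folded
# together 16 bits at a time and repacked into 32-bit result words.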
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
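@ .LNEON_1st: first pass (bp[0]); ap[] and np[] are consumed eight words per
@ iteration and the accumulators are spilled to the scratch area set up above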
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
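@ .LNEON_outer: subsequent passes (bp[i], i>0); column sums stored on the
@ previous pass are reloaded, ap[]*bp[i] and the np[] correction are folded
@ in, and the updated sums are written back to the scratch area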
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

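@ .LNEON_tail: propagate carries; each 64-bit column sum is folded into the
@ next one 16 bits at a time and the packed 32-bit result words are stored out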
.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

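@ final reduction on the integer side: subtract np[] from the converted
@ result four words at a time, keeping the borrow in the carry flag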
.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

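@ if the subtraction borrowed, copy the unreduced value from the scratch area
@ back over rp[]; either way wipe the scratch area with zeros as we go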
.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	bx	lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

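# expand the `...` expressions (the &Dlo/&Dhi calls above) before emitting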
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;