#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont
# code base and compiler-generated code with in-lined umull and even
# umlal instructions. The latter means that this code didn't really
# have the "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it's
# exclusively about decorations; the ABI and instruction syntax are
# identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, i.e. 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON code is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
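#
#### resulting frame as set up by the prologue below (a sketch for
#### orientation; offsets are relative to $num, i.e. &tp[num-1]):
#
#	+15*4	num		caller's stack argument
#	+14*4	&n0		caller's stack argument
#	+13*4	bp		saved r2
#	+12*4	rp		saved r0
#	+2*4	r4-r12,lr	10 saved registers
#	+1*4	tp[num]		extra carry word
#	 0	tp[num-1]
#	...
#	sp	tp[0]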

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

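	@ first pass: tp = (ap[]*bp[0] + np[]*m0)/2^32, where
	@ m0 = ap[0]*bp[0]*n0 mod 2^32 makes the discarded low word
	@ zero, so tp already carries the implicit one-word right
	@ shift of Montgomery reduction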
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

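	@ each outer iteration folds in the next word of bp: it adds
	@ ap[]*bp[i] to tp[], then adds np[]*m with m = tp[0]*n0 mod 2^32,
	@ again dropping the zero low word, i.e. one more implicit
	@ division by 2^32 per iteration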
.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

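	@ final, branch-free conditional subtraction: compute tp[]-np[]
	@ into rp[] with borrow propagation, turn the overall borrow into
	@ an all-ones/all-zeros mask, and use it below to select either
	@ the difference already in rp[] or the original tp[], zapping
	@ tp[] in the process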
	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

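# A note on the NEON code below, for orientation: b[i] and the
# Montgomery factor are split into 16-bit halves (vzip.16 against a
# zero register), so each vmull.u32/vmlal.u32 lane accumulates
# 32x16-bit partial products in a 64-bit lane, deferring carry
# propagation. Carries are therefore resolved only every 16 bits, in
# the .LNEON_tail/.LNEON_tail2 code, by the vshr.u64/vadd.u64/vzip.16
# sequence.
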
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

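	@ as in the integer-only path: if the subtraction above did not
	@ borrow, keep the difference already written to rp, otherwise
	@ copy the unreduced value from the stack over it; the temporary
	@ area is wiped with zeros either way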
.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT;