#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# Well, actually all contemporary AArch64 processors seem to have a
# limited multiplication issue rate, i.e. they can't issue a
# multiplication every cycle, which explains the moderate improvement
# coefficients in comparison to compiler-generated code. Recall that
# the compiler is instructed to use umulh and therefore uses the same
# number of multiplication instructions to do the job. Assembly's edge
# is to minimize the number of "collateral" instructions and of course
# instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60%, depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

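# For reference, below is a minimal C-style sketch of the word-serial
# Montgomery multiplication implemented by this module, computing
# rp[] = ap[]*bp[]*2^(-64*num) mod np[]. mul_add() is a hypothetical
# 64x64+64+carry primitive, and propagation of the upmost carry into
# an extra limb is elided for brevity:
#
#	for (i = 0; i < num; i++) {
#		c = 0;
#		for (j = 0; j < num; j++)	// t[] += ap[]*bp[i]
#			t[j] = mul_add(ap[j], bp[i], t[j], &c);
#		t[num] += c;
#		m = t[0] * n0;			// mod 2^64
#		c = 0;
#		for (j = 0; j < num; j++)	// t[] += np[]*m
#			t[j] = mul_add(np[j], m, t[j], &c);
#		t[num] += c;
#		for (j = 0; j < num; j++)	// t[0] is now zero,
#			t[j] = t[j+1];		// so shift right one limb
#	}
#	// conditionally subtract np[] if t[] is not yet < np[]
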
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
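#
# Computes rp[] = ap[]*bp[]*2^(-64*num) mod np[] and returns 1. n0
# points at the precomputed -np[0]^-1 mod 2^64, num is the number of
# 64-bit limbs; num%8==0 dispatches to __bn_sqr8x_mont and num%4==0
# to __bn_mul4x_mont below.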
     65 
     66 $code.=<<___;
     67 .text
     68 
     69 .globl	bn_mul_mont
     70 .type	bn_mul_mont,%function
     71 .align	5
     72 bn_mul_mont:
     73 	tst	$num,#7
     74 	b.eq	__bn_sqr8x_mont
     75 	tst	$num,#3
     76 	b.eq	__bn_mul4x_mont
     77 .Lmul_mont:
     78 	stp	x29,x30,[sp,#-64]!
     79 	add	x29,sp,#0
     80 	stp	x19,x20,[sp,#16]
     81 	stp	x21,x22,[sp,#32]
     82 	stp	x23,x24,[sp,#48]
     83 
     84 	ldr	$m0,[$bp],#8		// bp[0]
     85 	sub	$tp,sp,$num,lsl#3
     86 	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
     87 	lsl	$num,$num,#3
     88 	ldr	$n0,[$n0]		// *n0
     89 	and	$tp,$tp,#-16		// ABI says so
     90 	ldp	$hi1,$nj,[$np],#16	// np[0..1]
     91 
     92 	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
     93 	sub	$j,$num,#16		// j=num-2
     94 	umulh	$hi0,$hi0,$m0
     95 	mul	$alo,$aj,$m0		// ap[1]*bp[0]
     96 	umulh	$ahi,$aj,$m0
     97 
     98 	mul	$m1,$lo0,$n0		// "tp[0]"*n0
     99 	mov	sp,$tp			// alloca
    100 
    101 	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
    102 	umulh	$hi1,$hi1,$m1
    103 	mul	$nlo,$nj,$m1		// np[1]*m1
    104 	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. So
	//	when does it carry? Is there an alternative way to deduce
	//	it? If you follow the operations, you can observe that the
	//	condition for carry is quite simple: $lo0 being non-zero.
	//	The carry can therefore be calculated by adding -1 to
	//	$lo0, which is what the next instruction does.
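	//	Concretely, the discarded sum is lo(np[0]*m1)+$lo0, and
	//	m1 = $lo0*n0 with n0 = -np[0]^-1 mod 2^64, so the sum is
	//	0 mod 2^64: it carries out exactly when $lo0 is non-zero,
	//	and "subs xzr,$lo0,#1" sets the carry flag under the very
	//	same condition (no borrow iff $lo0 >= 1).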
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
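	// The csel in .Lcond_copy below accordingly picks the original
	// tp[j] when the subtraction borrowed (tp was already fully
	// reduced) and tp[j]-np[j] otherwise; tp is wiped either way
	// and no branch depends on the secret comparison outcome.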
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.
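#
# In outline: the cross products a[i]*a[j], i<j, are accumulated first
# (.Lsqr8x_outer_loop/.Lsqr8x_mul), the intermediate result is then
# doubled and the diagonal a[i]*a[i] terms added in
# (.Lsqr4x_shift_n_add), and a final pass (.Lsqr8x_reduction) performs
# the Montgomery reduction eight limbs at a time.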

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	 mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	 mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	 mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	 mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	 mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	 mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	 mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
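	// Note the modulo-scheduled carry: the "adc $carry,xzr,xzr" at
	// the top of .Lsqr8x_mul consumes the carry flag produced at
	// the bottom of the previous iteration, saving an instruction
	// inside the loop (see the "moved above"/"moved below" markers).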
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
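	// The doubling is done with extr, used as a 64-bit funnel shift:
	// e.g. "extr $t1,$t2,$t1,#63" yields (t2<<1)|(t1>>63), i.e. limb
	// t2 shifted left by one bit with the top bit of t1 carried in,
	// while mul/umulh supply the a[i]*a[i] diagonal terms to add.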
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	 ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	 ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	 ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	 ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	 ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
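	// i.e. eight n[]*(t[0]*n0) cancellations are chained per pass of
	// .Lsqr8x_reduction below before the t[] window slides by 8 limbs.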
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	 mov	$tp,sp
	 add	$ap,sp,$num
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	 ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	 ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	 stp	xzr,xzr,[$ap,#8*0]
	 stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	 stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	 stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	 stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	 stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	 stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.
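#
# That is, each pass of .Loop_mul4x_reduction folds a[0..3]*b[i] into
# the accumulator and immediately cancels the bottom limb with
# n[0..3]*t[0]*n0, while the t[0]*n0 values put aside on the stack are
# replayed against the remaining n[] limbs in .Loop_mul4x_tail.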

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

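	// $cnt cycles through 8,16,24,0: "and $cnt,$cnt,#31" wraps it at
	// the 4-limb window boundary, so that "ldr $bi,[$bp,$cnt]" picks
	// up the next b[i] and falls back to b[0] on the last lap.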
.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-7 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;
   1511