// Home | History | Annotate | Download | only in fipsmodule
      1 // This file is generated from a similarly-named Perl script in the BoringSSL
      2 // source tree. Do not edit by hand.
      3 
      4 #if defined(__has_feature)
      5 #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
      6 #define OPENSSL_NO_ASM
      7 #endif
      8 #endif
      9 
     10 #if !defined(OPENSSL_NO_ASM)
     11 #if defined(__aarch64__)
     12 #if defined(BORINGSSL_PREFIX)
     13 #include <boringssl_prefix_symbols_asm.h>
     14 #endif
     15 .text
     16 
     17 .globl	bn_mul_mont
     18 .hidden	bn_mul_mont
     19 .type	bn_mul_mont,%function
     20 .align	5
     21 bn_mul_mont:
     22 	tst	x5,#7
     23 	b.eq	__bn_sqr8x_mont
     24 	tst	x5,#3
     25 	b.eq	__bn_mul4x_mont
     26 .Lmul_mont:
     27 	stp	x29,x30,[sp,#-64]!
     28 	add	x29,sp,#0
     29 	stp	x19,x20,[sp,#16]
     30 	stp	x21,x22,[sp,#32]
     31 	stp	x23,x24,[sp,#48]
     32 
     33 	ldr	x9,[x2],#8		// bp[0]
     34 	sub	x22,sp,x5,lsl#3
     35 	ldp	x7,x8,[x1],#16	// ap[0..1]
     36 	lsl	x5,x5,#3
     37 	ldr	x4,[x4]		// *n0
     38 	and	x22,x22,#-16		// ABI says so
     39 	ldp	x13,x14,[x3],#16	// np[0..1]
     40 
     41 	mul	x6,x7,x9		// ap[0]*bp[0]
     42 	sub	x21,x5,#16		// j=num-2
     43 	umulh	x7,x7,x9
     44 	mul	x10,x8,x9		// ap[1]*bp[0]
     45 	umulh	x11,x8,x9
     46 
     47 	mul	x15,x6,x4		// "tp[0]"*n0
     48 	mov	sp,x22			// alloca
     49 
     50 	// (*)	mul	x12,x13,x15	// np[0]*m1
     51 	umulh	x13,x13,x15
     52 	mul	x16,x14,x15		// np[1]*m1
     53 	// (*)	adds	x12,x12,x6	// discarded
     54 	// (*)	As for removal of first multiplication and addition
     55 	//	instructions. The outcome of first addition is
     56 	//	guaranteed to be zero, which leaves two computationally
     57 	//	significant outcomes: it either carries or not. Then
     58 	//	question is when does it carry? Is there alternative
     59 	//	way to deduce it? If you follow operations, you can
     60 	//	observe that condition for carry is quite simple:
     61 	//	x6 being non-zero. So that carry can be calculated
     62 	//	by adding -1 to x6. That's what next instruction does.
     63 	subs	xzr,x6,#1		// (*)
     64 	umulh	x17,x14,x15
     65 	adc	x13,x13,xzr
     66 	cbz	x21,.L1st_skip
     67 
     68 .L1st:
     69 	ldr	x8,[x1],#8
     70 	adds	x6,x10,x7
     71 	sub	x21,x21,#8		// j--
     72 	adc	x7,x11,xzr
     73 
     74 	ldr	x14,[x3],#8
     75 	adds	x12,x16,x13
     76 	mul	x10,x8,x9		// ap[j]*bp[0]
     77 	adc	x13,x17,xzr
     78 	umulh	x11,x8,x9
     79 
     80 	adds	x12,x12,x6
     81 	mul	x16,x14,x15		// np[j]*m1
     82 	adc	x13,x13,xzr
     83 	umulh	x17,x14,x15
     84 	str	x12,[x22],#8		// tp[j-1]
     85 	cbnz	x21,.L1st
     86 
     87 .L1st_skip:
     88 	adds	x6,x10,x7
     89 	sub	x1,x1,x5		// rewind x1
     90 	adc	x7,x11,xzr
     91 
     92 	adds	x12,x16,x13
     93 	sub	x3,x3,x5		// rewind x3
     94 	adc	x13,x17,xzr
     95 
     96 	adds	x12,x12,x6
     97 	sub	x20,x5,#8		// i=num-1
     98 	adcs	x13,x13,x7
     99 
    100 	adc	x19,xzr,xzr		// upmost overflow bit
    101 	stp	x12,x13,[x22]
    102 
    103 .Louter:
    104 	ldr	x9,[x2],#8		// bp[i]
    105 	ldp	x7,x8,[x1],#16
    106 	ldr	x23,[sp]		// tp[0]
    107 	add	x22,sp,#8
    108 
    109 	mul	x6,x7,x9		// ap[0]*bp[i]
    110 	sub	x21,x5,#16		// j=num-2
    111 	umulh	x7,x7,x9
    112 	ldp	x13,x14,[x3],#16
    113 	mul	x10,x8,x9		// ap[1]*bp[i]
    114 	adds	x6,x6,x23
    115 	umulh	x11,x8,x9
    116 	adc	x7,x7,xzr
    117 
    118 	mul	x15,x6,x4
    119 	sub	x20,x20,#8		// i--
    120 
    121 	// (*)	mul	x12,x13,x15	// np[0]*m1
    122 	umulh	x13,x13,x15
    123 	mul	x16,x14,x15		// np[1]*m1
    124 	// (*)	adds	x12,x12,x6
    125 	subs	xzr,x6,#1		// (*)
    126 	umulh	x17,x14,x15
    127 	cbz	x21,.Linner_skip
    128 
    129 .Linner:
    130 	ldr	x8,[x1],#8
    131 	adc	x13,x13,xzr
    132 	ldr	x23,[x22],#8		// tp[j]
    133 	adds	x6,x10,x7
    134 	sub	x21,x21,#8		// j--
    135 	adc	x7,x11,xzr
    136 
    137 	adds	x12,x16,x13
    138 	ldr	x14,[x3],#8
    139 	adc	x13,x17,xzr
    140 
    141 	mul	x10,x8,x9		// ap[j]*bp[i]
    142 	adds	x6,x6,x23
    143 	umulh	x11,x8,x9
    144 	adc	x7,x7,xzr
    145 
    146 	mul	x16,x14,x15		// np[j]*m1
    147 	adds	x12,x12,x6
    148 	umulh	x17,x14,x15
    149 	str	x12,[x22,#-16]		// tp[j-1]
    150 	cbnz	x21,.Linner
    151 
    152 .Linner_skip:
    153 	ldr	x23,[x22],#8		// tp[j]
    154 	adc	x13,x13,xzr
    155 	adds	x6,x10,x7
    156 	sub	x1,x1,x5		// rewind x1
    157 	adc	x7,x11,xzr
    158 
    159 	adds	x12,x16,x13
    160 	sub	x3,x3,x5		// rewind x3
    161 	adcs	x13,x17,x19
    162 	adc	x19,xzr,xzr
    163 
    164 	adds	x6,x6,x23
    165 	adc	x7,x7,xzr
    166 
    167 	adds	x12,x12,x6
    168 	adcs	x13,x13,x7
    169 	adc	x19,x19,xzr		// upmost overflow bit
    170 	stp	x12,x13,[x22,#-16]
    171 
    172 	cbnz	x20,.Louter
    173 
    174 	// Final step. We see if result is larger than modulus, and
    175 	// if it is, subtract the modulus. But comparison implies
    176 	// subtraction. So we subtract modulus, see if it borrowed,
    177 	// and conditionally copy original value.
    178 	ldr	x23,[sp]		// tp[0]
    179 	add	x22,sp,#8
    180 	ldr	x14,[x3],#8		// np[0]
    181 	subs	x21,x5,#8		// j=num-1 and clear borrow
    182 	mov	x1,x0
    183 .Lsub:
    184 	sbcs	x8,x23,x14		// tp[j]-np[j]
    185 	ldr	x23,[x22],#8
    186 	sub	x21,x21,#8		// j--
    187 	ldr	x14,[x3],#8
    188 	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
    189 	cbnz	x21,.Lsub
    190 
    191 	sbcs	x8,x23,x14
    192 	sbcs	x19,x19,xzr		// did it borrow?
    193 	str	x8,[x1],#8		// rp[num-1]
    194 
    195 	ldr	x23,[sp]		// tp[0]
    196 	add	x22,sp,#8
    197 	ldr	x8,[x0],#8		// rp[0]
    198 	sub	x5,x5,#8		// num--
    199 	nop
    200 .Lcond_copy:
    201 	sub	x5,x5,#8		// num--
    202 	csel	x14,x23,x8,lo		// did it borrow?
    203 	ldr	x23,[x22],#8
    204 	ldr	x8,[x0],#8
    205 	str	xzr,[x22,#-16]		// wipe tp
    206 	str	x14,[x0,#-16]
    207 	cbnz	x5,.Lcond_copy
    208 
    209 	csel	x14,x23,x8,lo
    210 	str	xzr,[x22,#-8]		// wipe tp
    211 	str	x14,[x0,#-8]
    212 
    213 	ldp	x19,x20,[x29,#16]
    214 	mov	sp,x29
    215 	ldp	x21,x22,[x29,#32]
    216 	mov	x0,#1
    217 	ldp	x23,x24,[x29,#48]
    218 	ldr	x29,[sp],#64
    219 	ret
    220 .size	bn_mul_mont,.-bn_mul_mont
    221 .type	__bn_sqr8x_mont,%function
    222 .align	5
	// __bn_sqr8x_mont(rp=x0, ap=x1, bp=x2, np=x3, n0p=x4, num=x5)
	//
	// Montgomery squaring path, reached from bn_mul_mont when num is a
	// multiple of 8.  If ap != bp this is not actually a squaring, so
	// control falls through to __bn_mul4x_mont.  Works on a temporary
	// vector tp[2*num] allocated on the stack; rp/np/n0 are offloaded
	// into the 128-byte frame for the tail phases.  Returns 1 in x0.
__bn_sqr8x_mont:
    224 	cmp	x1,x2
    225 	b.ne	__bn_mul4x_mont
    226 .Lsqr8x_mont:
    227 	stp	x29,x30,[sp,#-128]!
    228 	add	x29,sp,#0
    229 	stp	x19,x20,[sp,#16]
    230 	stp	x21,x22,[sp,#32]
    231 	stp	x23,x24,[sp,#48]
    232 	stp	x25,x26,[sp,#64]
    233 	stp	x27,x28,[sp,#80]
    234 	stp	x0,x3,[sp,#96]	// offload rp and np
    235 
    236 	ldp	x6,x7,[x1,#8*0]
    237 	ldp	x8,x9,[x1,#8*2]
    238 	ldp	x10,x11,[x1,#8*4]
    239 	ldp	x12,x13,[x1,#8*6]
    240 
    241 	sub	x2,sp,x5,lsl#4
    242 	lsl	x5,x5,#3
    243 	ldr	x4,[x4]		// *n0
    244 	mov	sp,x2			// alloca
    245 	sub	x27,x5,#8*8
    246 	b	.Lsqr8x_zero_start
    247 
	// Zero the 2*num-limb temporary vector tp[] on the stack,
	// 16 limbs per pass.
.Lsqr8x_zero:
    249 	sub	x27,x27,#8*8
    250 	stp	xzr,xzr,[x2,#8*0]
    251 	stp	xzr,xzr,[x2,#8*2]
    252 	stp	xzr,xzr,[x2,#8*4]
    253 	stp	xzr,xzr,[x2,#8*6]
    254 .Lsqr8x_zero_start:
    255 	stp	xzr,xzr,[x2,#8*8]
    256 	stp	xzr,xzr,[x2,#8*10]
    257 	stp	xzr,xzr,[x2,#8*12]
    258 	stp	xzr,xzr,[x2,#8*14]
    259 	add	x2,x2,#8*16
    260 	cbnz	x27,.Lsqr8x_zero
    261 
    262 	add	x3,x1,x5
    263 	add	x1,x1,#8*8
    264 	mov	x19,xzr
    265 	mov	x20,xzr
    266 	mov	x21,xzr
    267 	mov	x22,xzr
    268 	mov	x23,xzr
    269 	mov	x24,xzr
    270 	mov	x25,xzr
    271 	mov	x26,xzr
    272 	mov	x2,sp
    273 	str	x4,[x29,#112]		// offload n0
    274 
    275 	// Multiply everything but a[i]*a[i]
    276 .align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
    306 
    307 	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
    308 	mul	x15,x8,x6
    309 	mul	x16,x9,x6
    310 	mul	x17,x10,x6
    311 	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
    312 	mul	x14,x11,x6
    313 	adcs	x21,x21,x15
    314 	mul	x15,x12,x6
    315 	adcs	x22,x22,x16
    316 	mul	x16,x13,x6
    317 	adcs	x23,x23,x17
    318 	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
    319 	adcs	x24,x24,x14
    320 	umulh	x14,x8,x6
    321 	adcs	x25,x25,x15
    322 	umulh	x15,x9,x6
    323 	adcs	x26,x26,x16
    324 	umulh	x16,x10,x6
    325 	stp	x19,x20,[x2],#8*2	// t[0..1]
    326 	adc	x19,xzr,xzr		// t[8]
    327 	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
    328 	umulh	x17,x11,x6
    329 	adcs	x22,x22,x14
    330 	umulh	x14,x12,x6
    331 	adcs	x23,x23,x15
    332 	umulh	x15,x13,x6
    333 	adcs	x24,x24,x16
    334 	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
    335 	adcs	x25,x25,x17
    336 	mul	x17,x9,x7
    337 	adcs	x26,x26,x14
    338 	mul	x14,x10,x7
    339 	adc	x19,x19,x15
    340 
    341 	mul	x15,x11,x7
    342 	adds	x22,x22,x16
    343 	mul	x16,x12,x7
    344 	adcs	x23,x23,x17
    345 	mul	x17,x13,x7
    346 	adcs	x24,x24,x14
    347 	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
    348 	adcs	x25,x25,x15
    349 	umulh	x15,x9,x7
    350 	adcs	x26,x26,x16
    351 	umulh	x16,x10,x7
    352 	adcs	x19,x19,x17
    353 	umulh	x17,x11,x7
    354 	stp	x21,x22,[x2],#8*2	// t[2..3]
    355 	adc	x20,xzr,xzr		// t[9]
    356 	adds	x23,x23,x14
    357 	umulh	x14,x12,x7
    358 	adcs	x24,x24,x15
    359 	umulh	x15,x13,x7
    360 	adcs	x25,x25,x16
    361 	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
    362 	adcs	x26,x26,x17
    363 	mul	x17,x10,x8
    364 	adcs	x19,x19,x14
    365 	mul	x14,x11,x8
    366 	adc	x20,x20,x15
    367 
    368 	mul	x15,x12,x8
    369 	adds	x24,x24,x16
    370 	mul	x16,x13,x8
    371 	adcs	x25,x25,x17
    372 	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
    373 	adcs	x26,x26,x14
    374 	umulh	x14,x10,x8
    375 	adcs	x19,x19,x15
    376 	umulh	x15,x11,x8
    377 	adcs	x20,x20,x16
    378 	umulh	x16,x12,x8
    379 	stp	x23,x24,[x2],#8*2	// t[4..5]
    380 	adc	x21,xzr,xzr		// t[10]
    381 	adds	x25,x25,x17
    382 	umulh	x17,x13,x8
    383 	adcs	x26,x26,x14
    384 	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
    385 	adcs	x19,x19,x15
    386 	mul	x15,x11,x9
    387 	adcs	x20,x20,x16
    388 	mul	x16,x12,x9
    389 	adc	x21,x21,x17
    390 
    391 	mul	x17,x13,x9
    392 	adds	x26,x26,x14
    393 	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
    394 	adcs	x19,x19,x15
    395 	umulh	x15,x11,x9
    396 	adcs	x20,x20,x16
    397 	umulh	x16,x12,x9
    398 	adcs	x21,x21,x17
    399 	umulh	x17,x13,x9
    400 	stp	x25,x26,[x2],#8*2	// t[6..7]
    401 	adc	x22,xzr,xzr		// t[11]
    402 	adds	x19,x19,x14
    403 	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
    404 	adcs	x20,x20,x15
    405 	mul	x15,x12,x10
    406 	adcs	x21,x21,x16
    407 	mul	x16,x13,x10
    408 	adc	x22,x22,x17
    409 
    410 	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
    411 	adds	x20,x20,x14
    412 	umulh	x14,x12,x10
    413 	adcs	x21,x21,x15
    414 	umulh	x15,x13,x10
    415 	adcs	x22,x22,x16
    416 	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
    417 	adc	x23,xzr,xzr		// t[12]
    418 	adds	x21,x21,x17
    419 	mul	x17,x13,x11
    420 	adcs	x22,x22,x14
    421 	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
    422 	adc	x23,x23,x15
    423 
    424 	umulh	x15,x13,x11
    425 	adds	x22,x22,x16
    426 	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
    427 	adcs	x23,x23,x17
    428 	umulh	x17,x13,x12		// hi(a[7]*a[6])
    429 	adc	x24,xzr,xzr		// t[13]
    430 	adds	x23,x23,x14
    431 	sub	x27,x3,x1	// done yet?
    432 	adc	x24,x24,x15
    433 
    434 	adds	x24,x24,x16
    435 	sub	x14,x3,x5	// rewinded ap
    436 	adc	x25,xzr,xzr		// t[14]
    437 	add	x25,x25,x17
    438 
    439 	cbz	x27,.Lsqr8x_outer_break
    440 
    441 	mov	x4,x6
    442 	ldp	x6,x7,[x2,#8*0]
    443 	ldp	x8,x9,[x2,#8*2]
    444 	ldp	x10,x11,[x2,#8*4]
    445 	ldp	x12,x13,[x2,#8*6]
    446 	adds	x19,x19,x6
    447 	adcs	x20,x20,x7
    448 	ldp	x6,x7,[x1,#8*0]
    449 	adcs	x21,x21,x8
    450 	adcs	x22,x22,x9
    451 	ldp	x8,x9,[x1,#8*2]
    452 	adcs	x23,x23,x10
    453 	adcs	x24,x24,x11
    454 	ldp	x10,x11,[x1,#8*4]
    455 	adcs	x25,x25,x12
    456 	mov	x0,x1
    457 	adcs	x26,xzr,x13
    458 	ldp	x12,x13,[x1,#8*6]
    459 	add	x1,x1,#8*8
    460 	//adc	x28,xzr,xzr		// moved below
    461 	mov	x27,#-8*8
    462 
	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
	// Multiply-accumulate the current 8 a[] limbs against each earlier
	// a[] limb (fetched one at a time via [x0,x27]) into the t[] window.
.Lsqr8x_mul:
    486 	mul	x14,x6,x4
    487 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
    488 	mul	x15,x7,x4
    489 	add	x27,x27,#8
    490 	mul	x16,x8,x4
    491 	mul	x17,x9,x4
    492 	adds	x19,x19,x14
    493 	mul	x14,x10,x4
    494 	adcs	x20,x20,x15
    495 	mul	x15,x11,x4
    496 	adcs	x21,x21,x16
    497 	mul	x16,x12,x4
    498 	adcs	x22,x22,x17
    499 	mul	x17,x13,x4
    500 	adcs	x23,x23,x14
    501 	umulh	x14,x6,x4
    502 	adcs	x24,x24,x15
    503 	umulh	x15,x7,x4
    504 	adcs	x25,x25,x16
    505 	umulh	x16,x8,x4
    506 	adcs	x26,x26,x17
    507 	umulh	x17,x9,x4
    508 	adc	x28,x28,xzr
    509 	str	x19,[x2],#8
    510 	adds	x19,x20,x14
    511 	umulh	x14,x10,x4
    512 	adcs	x20,x21,x15
    513 	umulh	x15,x11,x4
    514 	adcs	x21,x22,x16
    515 	umulh	x16,x12,x4
    516 	adcs	x22,x23,x17
    517 	umulh	x17,x13,x4
    518 	ldr	x4,[x0,x27]
    519 	adcs	x23,x24,x14
    520 	adcs	x24,x25,x15
    521 	adcs	x25,x26,x16
    522 	adcs	x26,x28,x17
    523 	//adc	x28,xzr,xzr		// moved above
    524 	cbnz	x27,.Lsqr8x_mul
    525 					// note that carry flag is guaranteed
    526 					// to be zero at this point
    527 	cmp	x1,x3		// done yet?
    528 	b.eq	.Lsqr8x_break
    529 
    530 	ldp	x6,x7,[x2,#8*0]
    531 	ldp	x8,x9,[x2,#8*2]
    532 	ldp	x10,x11,[x2,#8*4]
    533 	ldp	x12,x13,[x2,#8*6]
    534 	adds	x19,x19,x6
    535 	ldr	x4,[x0,#-8*8]
    536 	adcs	x20,x20,x7
    537 	ldp	x6,x7,[x1,#8*0]
    538 	adcs	x21,x21,x8
    539 	adcs	x22,x22,x9
    540 	ldp	x8,x9,[x1,#8*2]
    541 	adcs	x23,x23,x10
    542 	adcs	x24,x24,x11
    543 	ldp	x10,x11,[x1,#8*4]
    544 	adcs	x25,x25,x12
    545 	mov	x27,#-8*8
    546 	adcs	x26,x26,x13
    547 	ldp	x12,x13,[x1,#8*6]
    548 	add	x1,x1,#8*8
    549 	//adc	x28,xzr,xzr		// moved above
    550 	b	.Lsqr8x_mul
    551 
    552 .align	4
.Lsqr8x_break:
    554 	ldp	x6,x7,[x0,#8*0]
    555 	add	x1,x0,#8*8
    556 	ldp	x8,x9,[x0,#8*2]
    557 	sub	x14,x3,x1		// is it last iteration?
    558 	ldp	x10,x11,[x0,#8*4]
    559 	sub	x15,x2,x14
    560 	ldp	x12,x13,[x0,#8*6]
    561 	cbz	x14,.Lsqr8x_outer_loop
    562 
    563 	stp	x19,x20,[x2,#8*0]
    564 	ldp	x19,x20,[x15,#8*0]
    565 	stp	x21,x22,[x2,#8*2]
    566 	ldp	x21,x22,[x15,#8*2]
    567 	stp	x23,x24,[x2,#8*4]
    568 	ldp	x23,x24,[x15,#8*4]
    569 	stp	x25,x26,[x2,#8*6]
    570 	mov	x2,x15
    571 	ldp	x25,x26,[x15,#8*6]
    572 	b	.Lsqr8x_outer_loop
    573 
    574 .align	4
.Lsqr8x_outer_break:
    576 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
    577 	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
    578 	ldp	x15,x16,[sp,#8*1]
    579 	ldp	x11,x13,[x14,#8*2]
    580 	add	x1,x14,#8*4
    581 	ldp	x17,x14,[sp,#8*3]
    582 
    583 	stp	x19,x20,[x2,#8*0]
    584 	mul	x19,x7,x7
    585 	stp	x21,x22,[x2,#8*2]
    586 	umulh	x7,x7,x7
    587 	stp	x23,x24,[x2,#8*4]
    588 	mul	x8,x9,x9
    589 	stp	x25,x26,[x2,#8*6]
    590 	mov	x2,sp
    591 	umulh	x9,x9,x9
    592 	adds	x20,x7,x15,lsl#1
    593 	extr	x15,x16,x15,#63
    594 	sub	x27,x5,#8*4
    595 
	// Shift the cross-product vector left by one bit (extr #63 chains
	// adjacent limbs) while adding in the diagonal squares a[i]^2.
.Lsqr4x_shift_n_add:
    597 	adcs	x21,x8,x15
    598 	extr	x16,x17,x16,#63
    599 	sub	x27,x27,#8*4
    600 	adcs	x22,x9,x16
    601 	ldp	x15,x16,[x2,#8*5]
    602 	mul	x10,x11,x11
    603 	ldp	x7,x9,[x1],#8*2
    604 	umulh	x11,x11,x11
    605 	mul	x12,x13,x13
    606 	umulh	x13,x13,x13
    607 	extr	x17,x14,x17,#63
    608 	stp	x19,x20,[x2,#8*0]
    609 	adcs	x23,x10,x17
    610 	extr	x14,x15,x14,#63
    611 	stp	x21,x22,[x2,#8*2]
    612 	adcs	x24,x11,x14
    613 	ldp	x17,x14,[x2,#8*7]
    614 	extr	x15,x16,x15,#63
    615 	adcs	x25,x12,x15
    616 	extr	x16,x17,x16,#63
    617 	adcs	x26,x13,x16
    618 	ldp	x15,x16,[x2,#8*9]
    619 	mul	x6,x7,x7
    620 	ldp	x11,x13,[x1],#8*2
    621 	umulh	x7,x7,x7
    622 	mul	x8,x9,x9
    623 	umulh	x9,x9,x9
    624 	stp	x23,x24,[x2,#8*4]
    625 	extr	x17,x14,x17,#63
    626 	stp	x25,x26,[x2,#8*6]
    627 	add	x2,x2,#8*8
    628 	adcs	x19,x6,x17
    629 	extr	x14,x15,x14,#63
    630 	adcs	x20,x7,x14
    631 	ldp	x17,x14,[x2,#8*3]
    632 	extr	x15,x16,x15,#63
    633 	cbnz	x27,.Lsqr4x_shift_n_add
    634 	ldp	x1,x4,[x29,#104]	// pull np and n0
    635 
    636 	adcs	x21,x8,x15
    637 	extr	x16,x17,x16,#63
    638 	adcs	x22,x9,x16
    639 	ldp	x15,x16,[x2,#8*5]
    640 	mul	x10,x11,x11
    641 	umulh	x11,x11,x11
    642 	stp	x19,x20,[x2,#8*0]
    643 	mul	x12,x13,x13
    644 	umulh	x13,x13,x13
    645 	stp	x21,x22,[x2,#8*2]
    646 	extr	x17,x14,x17,#63
    647 	adcs	x23,x10,x17
    648 	extr	x14,x15,x14,#63
    649 	ldp	x19,x20,[sp,#8*0]
    650 	adcs	x24,x11,x14
    651 	extr	x15,x16,x15,#63
    652 	ldp	x6,x7,[x1,#8*0]
    653 	adcs	x25,x12,x15
    654 	extr	x16,xzr,x16,#63
    655 	ldp	x8,x9,[x1,#8*2]
    656 	adc	x26,x13,x16
    657 	ldp	x10,x11,[x1,#8*4]
    658 
    659 	// Reduce by 512 bits per iteration
    660 	mul	x28,x4,x19		// t[0]*n0
    661 	ldp	x12,x13,[x1,#8*6]
    662 	add	x3,x1,x5
    663 	ldp	x21,x22,[sp,#8*2]
    664 	stp	x23,x24,[x2,#8*4]
    665 	ldp	x23,x24,[sp,#8*4]
    666 	stp	x25,x26,[x2,#8*6]
    667 	ldp	x25,x26,[sp,#8*6]
    668 	add	x1,x1,#8*8
    669 	mov	x30,xzr		// initial top-most carry
    670 	mov	x2,sp
    671 	mov	x27,#8
    672 
.Lsqr8x_reduction:
    674 	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
    675 	mul	x15,x7,x28
    676 	sub	x27,x27,#1
    677 	mul	x16,x8,x28
    678 	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
    679 	mul	x17,x9,x28
    680 	// (*)	adds	xzr,x19,x14
    681 	subs	xzr,x19,#1		// (*)
    682 	mul	x14,x10,x28
    683 	adcs	x19,x20,x15
    684 	mul	x15,x11,x28
    685 	adcs	x20,x21,x16
    686 	mul	x16,x12,x28
    687 	adcs	x21,x22,x17
    688 	mul	x17,x13,x28
    689 	adcs	x22,x23,x14
    690 	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
    691 	adcs	x23,x24,x15
    692 	umulh	x15,x7,x28
    693 	adcs	x24,x25,x16
    694 	umulh	x16,x8,x28
    695 	adcs	x25,x26,x17
    696 	umulh	x17,x9,x28
    697 	adc	x26,xzr,xzr
    698 	adds	x19,x19,x14
    699 	umulh	x14,x10,x28
    700 	adcs	x20,x20,x15
    701 	umulh	x15,x11,x28
    702 	adcs	x21,x21,x16
    703 	umulh	x16,x12,x28
    704 	adcs	x22,x22,x17
    705 	umulh	x17,x13,x28
    706 	mul	x28,x4,x19		// next t[0]*n0
    707 	adcs	x23,x23,x14
    708 	adcs	x24,x24,x15
    709 	adcs	x25,x25,x16
    710 	adc	x26,x26,x17
    711 	cbnz	x27,.Lsqr8x_reduction
    712 
    713 	ldp	x14,x15,[x2,#8*0]
    714 	ldp	x16,x17,[x2,#8*2]
    715 	mov	x0,x2
    716 	sub	x27,x3,x1	// done yet?
    717 	adds	x19,x19,x14
    718 	adcs	x20,x20,x15
    719 	ldp	x14,x15,[x2,#8*4]
    720 	adcs	x21,x21,x16
    721 	adcs	x22,x22,x17
    722 	ldp	x16,x17,[x2,#8*6]
    723 	adcs	x23,x23,x14
    724 	adcs	x24,x24,x15
    725 	adcs	x25,x25,x16
    726 	adcs	x26,x26,x17
    727 	//adc	x28,xzr,xzr		// moved below
    728 	cbz	x27,.Lsqr8x8_post_condition
    729 
    730 	ldr	x4,[x2,#-8*8]
    731 	ldp	x6,x7,[x1,#8*0]
    732 	ldp	x8,x9,[x1,#8*2]
    733 	ldp	x10,x11,[x1,#8*4]
    734 	mov	x27,#-8*8
    735 	ldp	x12,x13,[x1,#8*6]
    736 	add	x1,x1,#8*8
    737 
	// Tail of the reduction: multiply the remaining n[] limbs by the
	// saved t[0]*n0 values and accumulate into the t[] window.
.Lsqr8x_tail:
    739 	mul	x14,x6,x4
    740 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
    741 	mul	x15,x7,x4
    742 	add	x27,x27,#8
    743 	mul	x16,x8,x4
    744 	mul	x17,x9,x4
    745 	adds	x19,x19,x14
    746 	mul	x14,x10,x4
    747 	adcs	x20,x20,x15
    748 	mul	x15,x11,x4
    749 	adcs	x21,x21,x16
    750 	mul	x16,x12,x4
    751 	adcs	x22,x22,x17
    752 	mul	x17,x13,x4
    753 	adcs	x23,x23,x14
    754 	umulh	x14,x6,x4
    755 	adcs	x24,x24,x15
    756 	umulh	x15,x7,x4
    757 	adcs	x25,x25,x16
    758 	umulh	x16,x8,x4
    759 	adcs	x26,x26,x17
    760 	umulh	x17,x9,x4
    761 	adc	x28,x28,xzr
    762 	str	x19,[x2],#8
    763 	adds	x19,x20,x14
    764 	umulh	x14,x10,x4
    765 	adcs	x20,x21,x15
    766 	umulh	x15,x11,x4
    767 	adcs	x21,x22,x16
    768 	umulh	x16,x12,x4
    769 	adcs	x22,x23,x17
    770 	umulh	x17,x13,x4
    771 	ldr	x4,[x0,x27]
    772 	adcs	x23,x24,x14
    773 	adcs	x24,x25,x15
    774 	adcs	x25,x26,x16
    775 	adcs	x26,x28,x17
    776 	//adc	x28,xzr,xzr		// moved above
    777 	cbnz	x27,.Lsqr8x_tail
    778 					// note that carry flag is guaranteed
    779 					// to be zero at this point
    780 	ldp	x6,x7,[x2,#8*0]
    781 	sub	x27,x3,x1	// done yet?
    782 	sub	x16,x3,x5	// rewinded np
    783 	ldp	x8,x9,[x2,#8*2]
    784 	ldp	x10,x11,[x2,#8*4]
    785 	ldp	x12,x13,[x2,#8*6]
    786 	cbz	x27,.Lsqr8x_tail_break
    787 
    788 	ldr	x4,[x0,#-8*8]
    789 	adds	x19,x19,x6
    790 	adcs	x20,x20,x7
    791 	ldp	x6,x7,[x1,#8*0]
    792 	adcs	x21,x21,x8
    793 	adcs	x22,x22,x9
    794 	ldp	x8,x9,[x1,#8*2]
    795 	adcs	x23,x23,x10
    796 	adcs	x24,x24,x11
    797 	ldp	x10,x11,[x1,#8*4]
    798 	adcs	x25,x25,x12
    799 	mov	x27,#-8*8
    800 	adcs	x26,x26,x13
    801 	ldp	x12,x13,[x1,#8*6]
    802 	add	x1,x1,#8*8
    803 	//adc	x28,xzr,xzr		// moved above
    804 	b	.Lsqr8x_tail
    805 
    806 .align	4
.Lsqr8x_tail_break:
    808 	ldr	x4,[x29,#112]		// pull n0
    809 	add	x27,x2,#8*8		// end of current t[num] window
    810 
    811 	subs	xzr,x30,#1		// "move" top-most carry to carry bit
    812 	adcs	x14,x19,x6
    813 	adcs	x15,x20,x7
    814 	ldp	x19,x20,[x0,#8*0]
    815 	adcs	x21,x21,x8
    816 	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
    817 	adcs	x22,x22,x9
    818 	ldp	x8,x9,[x16,#8*2]
    819 	adcs	x23,x23,x10
    820 	adcs	x24,x24,x11
    821 	ldp	x10,x11,[x16,#8*4]
    822 	adcs	x25,x25,x12
    823 	adcs	x26,x26,x13
    824 	ldp	x12,x13,[x16,#8*6]
    825 	add	x1,x16,#8*8
    826 	adc	x30,xzr,xzr	// top-most carry
    827 	mul	x28,x4,x19
    828 	stp	x14,x15,[x2,#8*0]
    829 	stp	x21,x22,[x2,#8*2]
    830 	ldp	x21,x22,[x0,#8*2]
    831 	stp	x23,x24,[x2,#8*4]
    832 	ldp	x23,x24,[x0,#8*4]
    833 	cmp	x27,x29		// did we hit the bottom?
    834 	stp	x25,x26,[x2,#8*6]
    835 	mov	x2,x0			// slide the window
    836 	ldp	x25,x26,[x0,#8*6]
    837 	mov	x27,#8
    838 	b.ne	.Lsqr8x_reduction
    839 
    840 	// Final step. We see if result is larger than modulus, and
    841 	// if it is, subtract the modulus. But comparison implies
    842 	// subtraction. So we subtract modulus, see if it borrowed,
    843 	// and conditionally copy original value.
    844 	ldr	x0,[x29,#96]		// pull rp
    845 	add	x2,x2,#8*8
    846 	subs	x14,x19,x6
    847 	sbcs	x15,x20,x7
    848 	sub	x27,x5,#8*8
    849 	mov	x3,x0		// x0 copy
    850 
.Lsqr8x_sub:
    852 	sbcs	x16,x21,x8
    853 	ldp	x6,x7,[x1,#8*0]
    854 	sbcs	x17,x22,x9
    855 	stp	x14,x15,[x0,#8*0]
    856 	sbcs	x14,x23,x10
    857 	ldp	x8,x9,[x1,#8*2]
    858 	sbcs	x15,x24,x11
    859 	stp	x16,x17,[x0,#8*2]
    860 	sbcs	x16,x25,x12
    861 	ldp	x10,x11,[x1,#8*4]
    862 	sbcs	x17,x26,x13
    863 	ldp	x12,x13,[x1,#8*6]
    864 	add	x1,x1,#8*8
    865 	ldp	x19,x20,[x2,#8*0]
    866 	sub	x27,x27,#8*8
    867 	ldp	x21,x22,[x2,#8*2]
    868 	ldp	x23,x24,[x2,#8*4]
    869 	ldp	x25,x26,[x2,#8*6]
    870 	add	x2,x2,#8*8
    871 	stp	x14,x15,[x0,#8*4]
    872 	sbcs	x14,x19,x6
    873 	stp	x16,x17,[x0,#8*6]
    874 	add	x0,x0,#8*8
    875 	sbcs	x15,x20,x7
    876 	cbnz	x27,.Lsqr8x_sub
    877 
    878 	sbcs	x16,x21,x8
    879 	mov	x2,sp
    880 	add	x1,sp,x5
    881 	ldp	x6,x7,[x3,#8*0]
    882 	sbcs	x17,x22,x9
    883 	stp	x14,x15,[x0,#8*0]
    884 	sbcs	x14,x23,x10
    885 	ldp	x8,x9,[x3,#8*2]
    886 	sbcs	x15,x24,x11
    887 	stp	x16,x17,[x0,#8*2]
    888 	sbcs	x16,x25,x12
    889 	ldp	x19,x20,[x1,#8*0]
    890 	sbcs	x17,x26,x13
    891 	ldp	x21,x22,[x1,#8*2]
    892 	sbcs	xzr,x30,xzr	// did it borrow?
    893 	ldr	x30,[x29,#8]		// pull return address
    894 	stp	x14,x15,[x0,#8*4]
    895 	stp	x16,x17,[x0,#8*6]
    896 
    897 	sub	x27,x5,#8*4
	// Constant-time select between t[] (borrow) and t[]-n[] already in
	// rp[], 4 limbs per pass, wiping the stack temporaries as we go.
.Lsqr4x_cond_copy:
    899 	sub	x27,x27,#8*4
    900 	csel	x14,x19,x6,lo
    901 	stp	xzr,xzr,[x2,#8*0]
    902 	csel	x15,x20,x7,lo
    903 	ldp	x6,x7,[x3,#8*4]
    904 	ldp	x19,x20,[x1,#8*4]
    905 	csel	x16,x21,x8,lo
    906 	stp	xzr,xzr,[x2,#8*2]
    907 	add	x2,x2,#8*4
    908 	csel	x17,x22,x9,lo
    909 	ldp	x8,x9,[x3,#8*6]
    910 	ldp	x21,x22,[x1,#8*6]
    911 	add	x1,x1,#8*4
    912 	stp	x14,x15,[x3,#8*0]
    913 	stp	x16,x17,[x3,#8*2]
    914 	add	x3,x3,#8*4
    915 	stp	xzr,xzr,[x1,#8*0]
    916 	stp	xzr,xzr,[x1,#8*2]
    917 	cbnz	x27,.Lsqr4x_cond_copy
    918 
    919 	csel	x14,x19,x6,lo
    920 	stp	xzr,xzr,[x2,#8*0]
    921 	csel	x15,x20,x7,lo
    922 	stp	xzr,xzr,[x2,#8*2]
    923 	csel	x16,x21,x8,lo
    924 	csel	x17,x22,x9,lo
    925 	stp	x14,x15,[x3,#8*0]
    926 	stp	x16,x17,[x3,#8*2]
    927 
    928 	b	.Lsqr8x_done
    929 
    930 .align	4
	// num==8 special case: whole result is still in registers, so the
	// subtract/conditional-select runs without the memory loop.
.Lsqr8x8_post_condition:
    932 	adc	x28,xzr,xzr
    933 	ldr	x30,[x29,#8]		// pull return address
    934 	// x19-7,x28 hold result, x6-7 hold modulus
    935 	subs	x6,x19,x6
    936 	ldr	x1,[x29,#96]		// pull rp
    937 	sbcs	x7,x20,x7
    938 	stp	xzr,xzr,[sp,#8*0]
    939 	sbcs	x8,x21,x8
    940 	stp	xzr,xzr,[sp,#8*2]
    941 	sbcs	x9,x22,x9
    942 	stp	xzr,xzr,[sp,#8*4]
    943 	sbcs	x10,x23,x10
    944 	stp	xzr,xzr,[sp,#8*6]
    945 	sbcs	x11,x24,x11
    946 	stp	xzr,xzr,[sp,#8*8]
    947 	sbcs	x12,x25,x12
    948 	stp	xzr,xzr,[sp,#8*10]
    949 	sbcs	x13,x26,x13
    950 	stp	xzr,xzr,[sp,#8*12]
    951 	sbcs	x28,x28,xzr	// did it borrow?
    952 	stp	xzr,xzr,[sp,#8*14]
    953 
    954 	// x6-7 hold result-modulus
    955 	csel	x6,x19,x6,lo
    956 	csel	x7,x20,x7,lo
    957 	csel	x8,x21,x8,lo
    958 	csel	x9,x22,x9,lo
    959 	stp	x6,x7,[x1,#8*0]
    960 	csel	x10,x23,x10,lo
    961 	csel	x11,x24,x11,lo
    962 	stp	x8,x9,[x1,#8*2]
    963 	csel	x12,x25,x12,lo
    964 	csel	x13,x26,x13,lo
    965 	stp	x10,x11,[x1,#8*4]
    966 	stp	x12,x13,[x1,#8*6]
    967 
.Lsqr8x_done:
    969 	ldp	x19,x20,[x29,#16]
    970 	mov	sp,x29
    971 	ldp	x21,x22,[x29,#32]
    972 	mov	x0,#1
    973 	ldp	x23,x24,[x29,#48]
    974 	ldp	x25,x26,[x29,#64]
    975 	ldp	x27,x28,[x29,#80]
    976 	ldr	x29,[sp],#128
    977 	ret
    978 .size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
    979 .type	__bn_mul4x_mont,%function
    980 .align	5
    981 __bn_mul4x_mont:
    982 	stp	x29,x30,[sp,#-128]!
    983 	add	x29,sp,#0
    984 	stp	x19,x20,[sp,#16]
    985 	stp	x21,x22,[sp,#32]
    986 	stp	x23,x24,[sp,#48]
    987 	stp	x25,x26,[sp,#64]
    988 	stp	x27,x28,[sp,#80]
    989 
    990 	sub	x26,sp,x5,lsl#3
    991 	lsl	x5,x5,#3
    992 	ldr	x4,[x4]		// *n0
    993 	sub	sp,x26,#8*4		// alloca
    994 
    995 	add	x10,x2,x5
    996 	add	x27,x1,x5
    997 	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
    998 
    999 	ldr	x24,[x2,#8*0]		// b[0]
   1000 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1001 	ldp	x8,x9,[x1,#8*2]
   1002 	add	x1,x1,#8*4
   1003 	mov	x19,xzr
   1004 	mov	x20,xzr
   1005 	mov	x21,xzr
   1006 	mov	x22,xzr
   1007 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1008 	ldp	x16,x17,[x3,#8*2]
   1009 	adds	x3,x3,#8*4		// clear carry bit
   1010 	mov	x0,xzr
   1011 	mov	x28,#0
   1012 	mov	x26,sp
   1013 
   1014 .Loop_mul4x_1st_reduction:
   1015 	mul	x10,x6,x24		// lo(a[0..3]*b[0])
   1016 	adc	x0,x0,xzr	// modulo-scheduled
   1017 	mul	x11,x7,x24
   1018 	add	x28,x28,#8
   1019 	mul	x12,x8,x24
   1020 	and	x28,x28,#31
   1021 	mul	x13,x9,x24
   1022 	adds	x19,x19,x10
   1023 	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
   1024 	adcs	x20,x20,x11
   1025 	mul	x25,x19,x4		// t[0]*n0
   1026 	adcs	x21,x21,x12
   1027 	umulh	x11,x7,x24
   1028 	adcs	x22,x22,x13
   1029 	umulh	x12,x8,x24
   1030 	adc	x23,xzr,xzr
   1031 	umulh	x13,x9,x24
   1032 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1033 	adds	x20,x20,x10
   1034 	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
   1035 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1036 	adcs	x21,x21,x11
   1037 	mul	x11,x15,x25
   1038 	adcs	x22,x22,x12
   1039 	mul	x12,x16,x25
   1040 	adc	x23,x23,x13		// can't overflow
   1041 	mul	x13,x17,x25
   1042 	// (*)	adds	xzr,x19,x10
   1043 	subs	xzr,x19,#1		// (*)
   1044 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1045 	adcs	x19,x20,x11
   1046 	umulh	x11,x15,x25
   1047 	adcs	x20,x21,x12
   1048 	umulh	x12,x16,x25
   1049 	adcs	x21,x22,x13
   1050 	umulh	x13,x17,x25
   1051 	adcs	x22,x23,x0
   1052 	adc	x0,xzr,xzr
   1053 	adds	x19,x19,x10
   1054 	sub	x10,x27,x1
   1055 	adcs	x20,x20,x11
   1056 	adcs	x21,x21,x12
   1057 	adcs	x22,x22,x13
   1058 	//adc	x0,x0,xzr
   1059 	cbnz	x28,.Loop_mul4x_1st_reduction
   1060 
   1061 	cbz	x10,.Lmul4x4_post_condition
   1062 
	// First-pass tail: extend the multiply+reduce across the remaining
	// a[]/n[] limbs (4 at a time), reusing the m = t[0]*n0 values stashed
	// on the stack by the reduction loop. Finished result limbs stream
	// out through x26.
   1063 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1064 	ldp	x8,x9,[x1,#8*2]
   1065 	add	x1,x1,#8*4
   1066 	ldr	x25,[sp]		// a[0]*n0
   1067 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1068 	ldp	x16,x17,[x3,#8*2]
   1069 	add	x3,x3,#8*4
   1070 
   1071 .Loop_mul4x_1st_tail:
   1072 	mul	x10,x6,x24		// lo(a[4..7]*b[i])
   1073 	adc	x0,x0,xzr	// modulo-scheduled
   1074 	mul	x11,x7,x24
   1075 	add	x28,x28,#8
   1076 	mul	x12,x8,x24
   1077 	and	x28,x28,#31
   1078 	mul	x13,x9,x24
   1079 	adds	x19,x19,x10
   1080 	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
   1081 	adcs	x20,x20,x11
   1082 	umulh	x11,x7,x24
   1083 	adcs	x21,x21,x12
   1084 	umulh	x12,x8,x24
   1085 	adcs	x22,x22,x13
   1086 	umulh	x13,x9,x24
   1087 	adc	x23,xzr,xzr
   1088 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1089 	adds	x20,x20,x10
   1090 	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
   1091 	adcs	x21,x21,x11
   1092 	mul	x11,x15,x25
   1093 	adcs	x22,x22,x12
   1094 	mul	x12,x16,x25
   1095 	adc	x23,x23,x13		// can't overflow
   1096 	mul	x13,x17,x25
   1097 	adds	x19,x19,x10
   1098 	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
   1099 	adcs	x20,x20,x11
   1100 	umulh	x11,x15,x25
   1101 	adcs	x21,x21,x12
   1102 	umulh	x12,x16,x25
   1103 	adcs	x22,x22,x13
   1104 	adcs	x23,x23,x0
   1105 	umulh	x13,x17,x25
   1106 	adc	x0,xzr,xzr
   1107 	ldr	x25,[sp,x28]		// next t[0]*n0
   1108 	str	x19,[x26],#8		// result!!!
   1109 	adds	x19,x20,x10
   1110 	sub	x10,x27,x1		// done yet?
   1111 	adcs	x20,x21,x11
   1112 	adcs	x21,x22,x12
   1113 	adcs	x22,x23,x13
   1114 	//adc	x0,x0,xzr
   1115 	cbnz	x28,.Loop_mul4x_1st_tail
   1116 
   1117 	sub	x11,x27,x5	// rewinded x1
	// All a[] limbs consumed for this b group -> move on to the next
	// group of four b words.
   1118 	cbz	x10,.Lmul4x_proceed
   1119 
	// More a[]/n[] limbs remain: load the next four of each and repeat.
   1120 	ldp	x6,x7,[x1,#8*0]
   1121 	ldp	x8,x9,[x1,#8*2]
   1122 	add	x1,x1,#8*4
   1123 	ldp	x14,x15,[x3,#8*0]
   1124 	ldp	x16,x17,[x3,#8*2]
   1125 	add	x3,x3,#8*4
   1126 	b	.Loop_mul4x_1st_tail
   1127 
   1128 .align	5
	// Advance to the next group of four b words: save the running top
	// carry in x30, rewind ap (via x11, the rewound copy computed above)
	// and np to their starts, and reload the partial result t[0..3]
	// from the stack for the next reduction pass.
   1129 .Lmul4x_proceed:
   1130 	ldr	x24,[x2,#8*4]!		// *++b
   1131 	adc	x30,x0,xzr
   1132 	ldp	x6,x7,[x11,#8*0]	// a[0..3]
   1133 	sub	x3,x3,x5		// rewind np
   1134 	ldp	x8,x9,[x11,#8*2]
   1135 	add	x1,x11,#8*4
   1136 
   1137 	stp	x19,x20,[x26,#8*0]	// result!!!
   1138 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1139 	stp	x21,x22,[x26,#8*2]	// result!!!
   1140 	ldp	x21,x22,[sp,#8*6]
   1141 
   1142 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1143 	mov	x26,sp
   1144 	ldp	x16,x17,[x3,#8*2]
	// "adds" of a constant clears C so the modulo-scheduled carry chain
	// in the following loop starts clean.
   1145 	adds	x3,x3,#8*4		// clear carry bit
   1146 	mov	x0,xzr
   1147 
   1148 .align	4
	// Steady-state reduction loop: same shape as the first-pass loop,
	// accumulating a[0..3]*b[i] into the t[] limbs reloaded from tp and
	// folding in n[0..3]*m with the same (*) carry trick.
   1149 .Loop_mul4x_reduction:
   1150 	mul	x10,x6,x24		// lo(a[0..3]*b[4])
   1151 	adc	x0,x0,xzr	// modulo-scheduled
   1152 	mul	x11,x7,x24
   1153 	add	x28,x28,#8
   1154 	mul	x12,x8,x24
   1155 	and	x28,x28,#31
   1156 	mul	x13,x9,x24
   1157 	adds	x19,x19,x10
   1158 	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
   1159 	adcs	x20,x20,x11
   1160 	mul	x25,x19,x4		// t[0]*n0
   1161 	adcs	x21,x21,x12
   1162 	umulh	x11,x7,x24
   1163 	adcs	x22,x22,x13
   1164 	umulh	x12,x8,x24
   1165 	adc	x23,xzr,xzr
   1166 	umulh	x13,x9,x24
   1167 	ldr	x24,[x2,x28]		// next b[i]
   1168 	adds	x20,x20,x10
   1169 	// (*)	mul	x10,x14,x25
   1170 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1171 	adcs	x21,x21,x11
   1172 	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
   1173 	adcs	x22,x22,x12
   1174 	mul	x12,x16,x25
   1175 	adc	x23,x23,x13		// can't overflow
   1176 	mul	x13,x17,x25
   1177 	// (*)	adds	xzr,x19,x10
	// (*) same carry-deduction trick as the first-pass loop: C is set
	//     exactly when t[0] (x19) is non-zero.
   1178 	subs	xzr,x19,#1		// (*)
   1179 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
   1180 	adcs	x19,x20,x11
   1181 	umulh	x11,x15,x25
   1182 	adcs	x20,x21,x12
   1183 	umulh	x12,x16,x25
   1184 	adcs	x21,x22,x13
   1185 	umulh	x13,x17,x25
   1186 	adcs	x22,x23,x0
   1187 	adc	x0,xzr,xzr
   1188 	adds	x19,x19,x10
   1189 	adcs	x20,x20,x11
   1190 	adcs	x21,x21,x12
   1191 	adcs	x22,x22,x13
   1192 	//adc	x0,x0,xzr
   1193 	cbnz	x28,.Loop_mul4x_reduction
   1194 
	// Between loops: fold the deferred carry, add the previously stored
	// partial limbs t[4..7] from tp, and load a[4..7]/n[4..7] for the tail.
   1195 	adc	x0,x0,xzr
   1196 	ldp	x10,x11,[x26,#8*4]	// t[4..7]
   1197 	ldp	x12,x13,[x26,#8*6]
   1198 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1199 	ldp	x8,x9,[x1,#8*2]
   1200 	add	x1,x1,#8*4
   1201 	adds	x19,x19,x10
   1202 	adcs	x20,x20,x11
   1203 	adcs	x21,x21,x12
   1204 	adcs	x22,x22,x13
   1205 	//adc	x0,x0,xzr
   1206 
   1207 	ldr	x25,[sp]		// t[0]*n0
   1208 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1209 	ldp	x16,x17,[x3,#8*2]
   1210 	add	x3,x3,#8*4
   1211 
   1212 .align	4
	// Steady-state tail loop: processes the higher a[]/n[] limbs for the
	// current group of four b words, consuming the stashed m values from
	// the stack and streaming finished limbs out through x26.
   1213 .Loop_mul4x_tail:
   1214 	mul	x10,x6,x24		// lo(a[4..7]*b[4])
   1215 	adc	x0,x0,xzr	// modulo-scheduled
   1216 	mul	x11,x7,x24
   1217 	add	x28,x28,#8
   1218 	mul	x12,x8,x24
   1219 	and	x28,x28,#31
   1220 	mul	x13,x9,x24
   1221 	adds	x19,x19,x10
   1222 	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
   1223 	adcs	x20,x20,x11
   1224 	umulh	x11,x7,x24
   1225 	adcs	x21,x21,x12
   1226 	umulh	x12,x8,x24
   1227 	adcs	x22,x22,x13
   1228 	umulh	x13,x9,x24
   1229 	adc	x23,xzr,xzr
   1230 	ldr	x24,[x2,x28]		// next b[i]
   1231 	adds	x20,x20,x10
   1232 	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
   1233 	adcs	x21,x21,x11
   1234 	mul	x11,x15,x25
   1235 	adcs	x22,x22,x12
   1236 	mul	x12,x16,x25
   1237 	adc	x23,x23,x13		// can't overflow
   1238 	mul	x13,x17,x25
   1239 	adds	x19,x19,x10
   1240 	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
   1241 	adcs	x20,x20,x11
   1242 	umulh	x11,x15,x25
   1243 	adcs	x21,x21,x12
   1244 	umulh	x12,x16,x25
   1245 	adcs	x22,x22,x13
   1246 	umulh	x13,x17,x25
   1247 	adcs	x23,x23,x0
   1248 	ldr	x25,[sp,x28]		// next a[0]*n0
   1249 	adc	x0,xzr,xzr
   1250 	str	x19,[x26],#8		// result!!!
   1251 	adds	x19,x20,x10
   1252 	sub	x10,x27,x1		// done yet?
   1253 	adcs	x20,x21,x11
   1254 	adcs	x21,x22,x12
   1255 	adcs	x22,x23,x13
   1256 	//adc	x0,x0,xzr
   1257 	cbnz	x28,.Loop_mul4x_tail
   1258 
   1259 	sub	x11,x3,x5		// rewinded np?
   1260 	adc	x0,x0,xzr
	// All limbs for this b group processed -> fold up and decide whether
	// another group of b words remains.
   1261 	cbz	x10,.Loop_mul4x_break
   1262 
	// Otherwise refill: next stored partial limbs plus next four
	// a[]/n[] limbs, and continue the tail.
   1263 	ldp	x10,x11,[x26,#8*4]
   1264 	ldp	x12,x13,[x26,#8*6]
   1265 	ldp	x6,x7,[x1,#8*0]
   1266 	ldp	x8,x9,[x1,#8*2]
   1267 	add	x1,x1,#8*4
   1268 	adds	x19,x19,x10
   1269 	adcs	x20,x20,x11
   1270 	adcs	x21,x21,x12
   1271 	adcs	x22,x22,x13
   1272 	//adc	x0,x0,xzr
   1273 	ldp	x14,x15,[x3,#8*0]
   1274 	ldp	x16,x17,[x3,#8*2]
   1275 	add	x3,x3,#8*4
   1276 	b	.Loop_mul4x_tail
   1277 
   1278 .align	4
	// End of one full pass over a[] for the current b group: fold in the
	// carry saved in x30, store the top partial limbs, rewind ap/np, and
	// either start the next b group or fall through to finalization.
   1279 .Loop_mul4x_break:
	// x29 is the frame pointer; rp and &b[num] were presumably saved at
	// [x29,#96] by the prologue outside this view — confirm.
   1280 	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
   1281 	adds	x19,x19,x30
   1282 	add	x2,x2,#8*4		// bp++
   1283 	adcs	x20,x20,xzr
   1284 	sub	x1,x1,x5		// rewind ap
   1285 	adcs	x21,x21,xzr
   1286 	stp	x19,x20,[x26,#8*0]	// result!!!
   1287 	adcs	x22,x22,xzr
   1288 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1289 	adc	x30,x0,xzr
   1290 	stp	x21,x22,[x26,#8*2]	// result!!!
   1291 	cmp	x2,x13			// done yet?
   1292 	ldp	x21,x22,[sp,#8*6]
   1293 	ldp	x14,x15,[x11,#8*0]	// n[0..3]
   1294 	ldp	x16,x17,[x11,#8*2]
   1295 	add	x3,x11,#8*4
   1296 	b.eq	.Lmul4x_post
   1297 
	// More b words: reload a[0..3], clear C for the next carry chain,
	// and re-enter the reduction loop.
   1298 	ldr	x24,[x2]
   1299 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1300 	ldp	x8,x9,[x1,#8*2]
   1301 	adds	x1,x1,#8*4		// clear carry bit
   1302 	mov	x0,xzr
   1303 	mov	x26,sp
   1304 	b	.Loop_mul4x_reduction
   1305 
   1306 .align	4
   1307 .Lmul4x_post:
   1308 	// Final step. We see if result is larger than modulus, and
   1309 	// if it is, subtract the modulus. But comparison implies
   1310 	// subtraction. So we subtract modulus, see if it borrowed,
   1311 	// and conditionally copy original value.
	// This is done branchlessly (sbcs + csel) — presumably to keep the
	// selection constant-time, as is conventional for this code.
   1312 	mov	x0,x12
   1313 	mov	x27,x12		// x0 copy
   1314 	subs	x10,x19,x14
   1315 	add	x26,sp,#8*8
   1316 	sbcs	x11,x20,x15
   1317 	sub	x28,x5,#8*4
   1318 
	// Subtract n[] from t[] four limbs at a time, writing the difference
	// to rp (x0) while x27 keeps the original rp for the copy-back pass.
   1319 .Lmul4x_sub:
   1320 	sbcs	x12,x21,x16
   1321 	ldp	x14,x15,[x3,#8*0]
   1322 	sub	x28,x28,#8*4
   1323 	ldp	x19,x20,[x26,#8*0]
   1324 	sbcs	x13,x22,x17
   1325 	ldp	x16,x17,[x3,#8*2]
   1326 	add	x3,x3,#8*4
   1327 	ldp	x21,x22,[x26,#8*2]
   1328 	add	x26,x26,#8*4
   1329 	stp	x10,x11,[x0,#8*0]
   1330 	sbcs	x10,x19,x14
   1331 	stp	x12,x13,[x0,#8*2]
   1332 	add	x0,x0,#8*4
   1333 	sbcs	x11,x20,x15
   1334 	cbnz	x28,.Lmul4x_sub
   1335 
   1336 	sbcs	x12,x21,x16
   1337 	mov	x26,sp
   1338 	add	x1,sp,#8*4
   1339 	ldp	x6,x7,[x27,#8*0]
   1340 	sbcs	x13,x22,x17
   1341 	stp	x10,x11,[x0,#8*0]
   1342 	ldp	x8,x9,[x27,#8*2]
   1343 	stp	x12,x13,[x0,#8*2]
   1344 	ldp	x19,x20,[x1,#8*0]
   1345 	ldp	x21,x22,[x1,#8*2]
	// Fold the saved top carry (x30) into the borrow chain: C clears
	// ("lo" below) exactly when the subtraction borrowed overall.
   1346 	sbcs	xzr,x30,xzr	// did it borrow?
   1347 	ldr	x30,[x29,#8]		// pull return address
   1348 
   1349 	sub	x28,x5,#8*4
	// Conditionally copy back: if the subtraction borrowed (lo), keep the
	// original t[] limbs (x19..x22) instead of the difference already in
	// rp; also zero the stack scratch as we go — presumably to scrub
	// sensitive intermediates.
   1350 .Lmul4x_cond_copy:
   1351 	sub	x28,x28,#8*4
   1352 	csel	x10,x19,x6,lo
   1353 	stp	xzr,xzr,[x26,#8*0]
   1354 	csel	x11,x20,x7,lo
   1355 	ldp	x6,x7,[x27,#8*4]
   1356 	ldp	x19,x20,[x1,#8*4]
   1357 	csel	x12,x21,x8,lo
   1358 	stp	xzr,xzr,[x26,#8*2]
   1359 	add	x26,x26,#8*4
   1360 	csel	x13,x22,x9,lo
   1361 	ldp	x8,x9,[x27,#8*6]
   1362 	ldp	x21,x22,[x1,#8*6]
   1363 	add	x1,x1,#8*4
   1364 	stp	x10,x11,[x27,#8*0]
   1365 	stp	x12,x13,[x27,#8*2]
   1366 	add	x27,x27,#8*4
   1367 	cbnz	x28,.Lmul4x_cond_copy
   1368 
	// Last four limbs, plus final scrub of the scratch area.
   1369 	csel	x10,x19,x6,lo
   1370 	stp	xzr,xzr,[x26,#8*0]
   1371 	csel	x11,x20,x7,lo
   1372 	stp	xzr,xzr,[x26,#8*2]
   1373 	csel	x12,x21,x8,lo
   1374 	stp	xzr,xzr,[x26,#8*3]
   1375 	csel	x13,x22,x9,lo
   1376 	stp	xzr,xzr,[x26,#8*4]
   1377 	stp	x10,x11,[x27,#8*0]
   1378 	stp	x12,x13,[x27,#8*2]
   1379 
   1380 	b	.Lmul4x_done
   1381 
   1382 .align	4
	// Fast finalization for num == 4: the whole result fits in
	// x19..x22 (+ carry in x0), so do a single 4-limb conditional
	// subtraction of the modulus (held in x14..x17), zeroing the stack
	// scratch along the way.
   1383 .Lmul4x4_post_condition:
   1384 	adc	x0,x0,xzr
   1385 	ldr	x1,[x29,#96]		// pull rp
   1386 	// x19-3,x0 hold result, x14-7 hold modulus
   1387 	subs	x6,x19,x14
   1388 	ldr	x30,[x29,#8]		// pull return address
   1389 	sbcs	x7,x20,x15
   1390 	stp	xzr,xzr,[sp,#8*0]
   1391 	sbcs	x8,x21,x16
   1392 	stp	xzr,xzr,[sp,#8*2]
   1393 	sbcs	x9,x22,x17
   1394 	stp	xzr,xzr,[sp,#8*4]
   1395 	sbcs	xzr,x0,xzr		// did it borrow?
   1396 	stp	xzr,xzr,[sp,#8*6]
   1397 
   1398 	// x6-3 hold result-modulus
	// lo (borrow) -> keep the original result; otherwise keep the
	// reduced value. Branchless select, as in .Lmul4x_cond_copy.
   1399 	csel	x6,x19,x6,lo
   1400 	csel	x7,x20,x7,lo
   1401 	csel	x8,x21,x8,lo
   1402 	csel	x9,x22,x9,lo
   1403 	stp	x6,x7,[x1,#8*0]
   1404 	stp	x8,x9,[x1,#8*2]
   1405 
	// Epilogue: restore callee-saved x19..x28, return 1 in x0
	// (bn_mul_mont's success convention), and pop the 128-byte register
	// save area — presumably matching the __bn_mul4x_mont prologue
	// outside this view; confirm against the full file.
   1406 .Lmul4x_done:
   1407 	ldp	x19,x20,[x29,#16]
   1408 	mov	sp,x29
   1409 	ldp	x21,x22,[x29,#32]
   1410 	mov	x0,#1
   1411 	ldp	x23,x24,[x29,#48]
   1412 	ldp	x25,x26,[x29,#64]
   1413 	ldp	x27,x28,[x29,#80]
   1414 	ldr	x29,[sp],#128
   1415 	ret
   1416 .size	__bn_mul4x_mont,.-__bn_mul4x_mont
	// ASCII ident string: "Montgomery Multiplication for ARMv8,
	// CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated data, not code).
   1417 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1418 .align	2
   1419 .align	4
   1420 #endif
   1421 #endif  // !OPENSSL_NO_ASM
   1422