// (extraction artifact from a code-browser listing, kept as a comment so the
//  file assembles): Home | History | Annotate | Download | only in fipsmodule
      1 // This file is generated from a similarly-named Perl script in the BoringSSL
      2 // source tree. Do not edit by hand.
      3 
      4 #if defined(__has_feature)
      5 #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
      6 #define OPENSSL_NO_ASM
      7 #endif
      8 #endif
      9 
     10 #if !defined(OPENSSL_NO_ASM)
     11 #if defined(BORINGSSL_PREFIX)
     12 #include <boringssl_prefix_symbols_asm.h>
     13 #endif
     14 .text
     15 
     16 .globl	_bn_mul_mont
     17 .private_extern	_bn_mul_mont
     18 
     19 .align	5
     20 _bn_mul_mont:
     21 	tst	x5,#7
     22 	b.eq	__bn_sqr8x_mont
     23 	tst	x5,#3
     24 	b.eq	__bn_mul4x_mont
     25 Lmul_mont:
     26 	stp	x29,x30,[sp,#-64]!
     27 	add	x29,sp,#0
     28 	stp	x19,x20,[sp,#16]
     29 	stp	x21,x22,[sp,#32]
     30 	stp	x23,x24,[sp,#48]
     31 
     32 	ldr	x9,[x2],#8		// bp[0]
     33 	sub	x22,sp,x5,lsl#3
     34 	ldp	x7,x8,[x1],#16	// ap[0..1]
     35 	lsl	x5,x5,#3
     36 	ldr	x4,[x4]		// *n0
     37 	and	x22,x22,#-16		// ABI says so
     38 	ldp	x13,x14,[x3],#16	// np[0..1]
     39 
     40 	mul	x6,x7,x9		// ap[0]*bp[0]
     41 	sub	x21,x5,#16		// j=num-2
     42 	umulh	x7,x7,x9
     43 	mul	x10,x8,x9		// ap[1]*bp[0]
     44 	umulh	x11,x8,x9
     45 
     46 	mul	x15,x6,x4		// "tp[0]"*n0
     47 	mov	sp,x22			// alloca
     48 
     49 	// (*)	mul	x12,x13,x15	// np[0]*m1
     50 	umulh	x13,x13,x15
     51 	mul	x16,x14,x15		// np[1]*m1
     52 	// (*)	adds	x12,x12,x6	// discarded
     53 	// (*)	As for removal of first multiplication and addition
     54 	//	instructions. The outcome of first addition is
     55 	//	guaranteed to be zero, which leaves two computationally
     56 	//	significant outcomes: it either carries or not. Then
     57 	//	question is when does it carry? Is there alternative
     58 	//	way to deduce it? If you follow operations, you can
     59 	//	observe that condition for carry is quite simple:
     60 	//	x6 being non-zero. So that carry can be calculated
     61 	//	by adding -1 to x6. That's what next instruction does.
     62 	subs	xzr,x6,#1		// (*)
     63 	umulh	x17,x14,x15
     64 	adc	x13,x13,xzr
     65 	cbz	x21,L1st_skip
     66 
     67 L1st:
     68 	ldr	x8,[x1],#8
     69 	adds	x6,x10,x7
     70 	sub	x21,x21,#8		// j--
     71 	adc	x7,x11,xzr
     72 
     73 	ldr	x14,[x3],#8
     74 	adds	x12,x16,x13
     75 	mul	x10,x8,x9		// ap[j]*bp[0]
     76 	adc	x13,x17,xzr
     77 	umulh	x11,x8,x9
     78 
     79 	adds	x12,x12,x6
     80 	mul	x16,x14,x15		// np[j]*m1
     81 	adc	x13,x13,xzr
     82 	umulh	x17,x14,x15
     83 	str	x12,[x22],#8		// tp[j-1]
     84 	cbnz	x21,L1st
     85 
     86 L1st_skip:
     87 	adds	x6,x10,x7
     88 	sub	x1,x1,x5		// rewind x1
     89 	adc	x7,x11,xzr
     90 
     91 	adds	x12,x16,x13
     92 	sub	x3,x3,x5		// rewind x3
     93 	adc	x13,x17,xzr
     94 
     95 	adds	x12,x12,x6
     96 	sub	x20,x5,#8		// i=num-1
     97 	adcs	x13,x13,x7
     98 
     99 	adc	x19,xzr,xzr		// upmost overflow bit
    100 	stp	x12,x13,[x22]
    101 
    102 Louter:
    103 	ldr	x9,[x2],#8		// bp[i]
    104 	ldp	x7,x8,[x1],#16
    105 	ldr	x23,[sp]		// tp[0]
    106 	add	x22,sp,#8
    107 
    108 	mul	x6,x7,x9		// ap[0]*bp[i]
    109 	sub	x21,x5,#16		// j=num-2
    110 	umulh	x7,x7,x9
    111 	ldp	x13,x14,[x3],#16
    112 	mul	x10,x8,x9		// ap[1]*bp[i]
    113 	adds	x6,x6,x23
    114 	umulh	x11,x8,x9
    115 	adc	x7,x7,xzr
    116 
    117 	mul	x15,x6,x4
    118 	sub	x20,x20,#8		// i--
    119 
    120 	// (*)	mul	x12,x13,x15	// np[0]*m1
    121 	umulh	x13,x13,x15
    122 	mul	x16,x14,x15		// np[1]*m1
    123 	// (*)	adds	x12,x12,x6
    124 	subs	xzr,x6,#1		// (*)
    125 	umulh	x17,x14,x15
    126 	cbz	x21,Linner_skip
    127 
    128 Linner:
    129 	ldr	x8,[x1],#8
    130 	adc	x13,x13,xzr
    131 	ldr	x23,[x22],#8		// tp[j]
    132 	adds	x6,x10,x7
    133 	sub	x21,x21,#8		// j--
    134 	adc	x7,x11,xzr
    135 
    136 	adds	x12,x16,x13
    137 	ldr	x14,[x3],#8
    138 	adc	x13,x17,xzr
    139 
    140 	mul	x10,x8,x9		// ap[j]*bp[i]
    141 	adds	x6,x6,x23
    142 	umulh	x11,x8,x9
    143 	adc	x7,x7,xzr
    144 
    145 	mul	x16,x14,x15		// np[j]*m1
    146 	adds	x12,x12,x6
    147 	umulh	x17,x14,x15
    148 	str	x12,[x22,#-16]		// tp[j-1]
    149 	cbnz	x21,Linner
    150 
    151 Linner_skip:
    152 	ldr	x23,[x22],#8		// tp[j]
    153 	adc	x13,x13,xzr
    154 	adds	x6,x10,x7
    155 	sub	x1,x1,x5		// rewind x1
    156 	adc	x7,x11,xzr
    157 
    158 	adds	x12,x16,x13
    159 	sub	x3,x3,x5		// rewind x3
    160 	adcs	x13,x17,x19
    161 	adc	x19,xzr,xzr
    162 
    163 	adds	x6,x6,x23
    164 	adc	x7,x7,xzr
    165 
    166 	adds	x12,x12,x6
    167 	adcs	x13,x13,x7
    168 	adc	x19,x19,xzr		// upmost overflow bit
    169 	stp	x12,x13,[x22,#-16]
    170 
    171 	cbnz	x20,Louter
    172 
    173 	// Final step. We see if result is larger than modulus, and
    174 	// if it is, subtract the modulus. But comparison implies
    175 	// subtraction. So we subtract modulus, see if it borrowed,
    176 	// and conditionally copy original value.
    177 	ldr	x23,[sp]		// tp[0]
    178 	add	x22,sp,#8
    179 	ldr	x14,[x3],#8		// np[0]
    180 	subs	x21,x5,#8		// j=num-1 and clear borrow
    181 	mov	x1,x0
    182 Lsub:
    183 	sbcs	x8,x23,x14		// tp[j]-np[j]
    184 	ldr	x23,[x22],#8
    185 	sub	x21,x21,#8		// j--
    186 	ldr	x14,[x3],#8
    187 	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
    188 	cbnz	x21,Lsub
    189 
    190 	sbcs	x8,x23,x14
    191 	sbcs	x19,x19,xzr		// did it borrow?
    192 	str	x8,[x1],#8		// rp[num-1]
    193 
    194 	ldr	x23,[sp]		// tp[0]
    195 	add	x22,sp,#8
    196 	ldr	x8,[x0],#8		// rp[0]
    197 	sub	x5,x5,#8		// num--
    198 	nop
    199 Lcond_copy:
    200 	sub	x5,x5,#8		// num--
    201 	csel	x14,x23,x8,lo		// did it borrow?
    202 	ldr	x23,[x22],#8
    203 	ldr	x8,[x0],#8
    204 	str	xzr,[x22,#-16]		// wipe tp
    205 	str	x14,[x0,#-16]
    206 	cbnz	x5,Lcond_copy
    207 
    208 	csel	x14,x23,x8,lo
    209 	str	xzr,[x22,#-8]		// wipe tp
    210 	str	x14,[x0,#-8]
    211 
    212 	ldp	x19,x20,[x29,#16]
    213 	mov	sp,x29
    214 	ldp	x21,x22,[x29,#32]
    215 	mov	x0,#1
    216 	ldp	x23,x24,[x29,#48]
    217 	ldr	x29,[sp],#64
    218 	ret
    219 
    220 
    221 .align	5
    222 __bn_sqr8x_mont:
    223 	cmp	x1,x2
    224 	b.ne	__bn_mul4x_mont
    225 Lsqr8x_mont:
    226 	stp	x29,x30,[sp,#-128]!
    227 	add	x29,sp,#0
    228 	stp	x19,x20,[sp,#16]
    229 	stp	x21,x22,[sp,#32]
    230 	stp	x23,x24,[sp,#48]
    231 	stp	x25,x26,[sp,#64]
    232 	stp	x27,x28,[sp,#80]
    233 	stp	x0,x3,[sp,#96]	// offload rp and np
    234 
    235 	ldp	x6,x7,[x1,#8*0]
    236 	ldp	x8,x9,[x1,#8*2]
    237 	ldp	x10,x11,[x1,#8*4]
    238 	ldp	x12,x13,[x1,#8*6]
    239 
    240 	sub	x2,sp,x5,lsl#4
    241 	lsl	x5,x5,#3
    242 	ldr	x4,[x4]		// *n0
    243 	mov	sp,x2			// alloca
    244 	sub	x27,x5,#8*8
    245 	b	Lsqr8x_zero_start
    246 
    247 Lsqr8x_zero:
    248 	sub	x27,x27,#8*8
    249 	stp	xzr,xzr,[x2,#8*0]
    250 	stp	xzr,xzr,[x2,#8*2]
    251 	stp	xzr,xzr,[x2,#8*4]
    252 	stp	xzr,xzr,[x2,#8*6]
    253 Lsqr8x_zero_start:
    254 	stp	xzr,xzr,[x2,#8*8]
    255 	stp	xzr,xzr,[x2,#8*10]
    256 	stp	xzr,xzr,[x2,#8*12]
    257 	stp	xzr,xzr,[x2,#8*14]
    258 	add	x2,x2,#8*16
    259 	cbnz	x27,Lsqr8x_zero
    260 
    261 	add	x3,x1,x5
    262 	add	x1,x1,#8*8
    263 	mov	x19,xzr
    264 	mov	x20,xzr
    265 	mov	x21,xzr
    266 	mov	x22,xzr
    267 	mov	x23,xzr
    268 	mov	x24,xzr
    269 	mov	x25,xzr
    270 	mov	x26,xzr
    271 	mov	x2,sp
    272 	str	x4,[x29,#112]		// offload n0
    273 
    274 	// Multiply everything but a[i]*a[i]
    275 .align	4
    276 Lsqr8x_outer_loop:
    277         //                                                 a[1]a[0]	(i)
    278         //                                             a[2]a[0]
    279         //                                         a[3]a[0]
    280         //                                     a[4]a[0]
    281         //                                 a[5]a[0]
    282         //                             a[6]a[0]
    283         //                         a[7]a[0]
    284         //                                         a[2]a[1]		(ii)
    285         //                                     a[3]a[1]
    286         //                                 a[4]a[1]
    287         //                             a[5]a[1]
    288         //                         a[6]a[1]
    289         //                     a[7]a[1]
    290         //                                 a[3]a[2]			(iii)
    291         //                             a[4]a[2]
    292         //                         a[5]a[2]
    293         //                     a[6]a[2]
    294         //                 a[7]a[2]
    295         //                         a[4]a[3]				(iv)
    296         //                     a[5]a[3]
    297         //                 a[6]a[3]
    298         //             a[7]a[3]
    299         //                 a[5]a[4]					(v)
    300         //             a[6]a[4]
    301         //         a[7]a[4]
    302         //         a[6]a[5]						(vi)
    303         //     a[7]a[5]
    304         // a[7]a[6]							(vii)
    305 
    306 	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
    307 	mul	x15,x8,x6
    308 	mul	x16,x9,x6
    309 	mul	x17,x10,x6
    310 	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
    311 	mul	x14,x11,x6
    312 	adcs	x21,x21,x15
    313 	mul	x15,x12,x6
    314 	adcs	x22,x22,x16
    315 	mul	x16,x13,x6
    316 	adcs	x23,x23,x17
    317 	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
    318 	adcs	x24,x24,x14
    319 	umulh	x14,x8,x6
    320 	adcs	x25,x25,x15
    321 	umulh	x15,x9,x6
    322 	adcs	x26,x26,x16
    323 	umulh	x16,x10,x6
    324 	stp	x19,x20,[x2],#8*2	// t[0..1]
    325 	adc	x19,xzr,xzr		// t[8]
    326 	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
    327 	umulh	x17,x11,x6
    328 	adcs	x22,x22,x14
    329 	umulh	x14,x12,x6
    330 	adcs	x23,x23,x15
    331 	umulh	x15,x13,x6
    332 	adcs	x24,x24,x16
    333 	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
    334 	adcs	x25,x25,x17
    335 	mul	x17,x9,x7
    336 	adcs	x26,x26,x14
    337 	mul	x14,x10,x7
    338 	adc	x19,x19,x15
    339 
    340 	mul	x15,x11,x7
    341 	adds	x22,x22,x16
    342 	mul	x16,x12,x7
    343 	adcs	x23,x23,x17
    344 	mul	x17,x13,x7
    345 	adcs	x24,x24,x14
    346 	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
    347 	adcs	x25,x25,x15
    348 	umulh	x15,x9,x7
    349 	adcs	x26,x26,x16
    350 	umulh	x16,x10,x7
    351 	adcs	x19,x19,x17
    352 	umulh	x17,x11,x7
    353 	stp	x21,x22,[x2],#8*2	// t[2..3]
    354 	adc	x20,xzr,xzr		// t[9]
    355 	adds	x23,x23,x14
    356 	umulh	x14,x12,x7
    357 	adcs	x24,x24,x15
    358 	umulh	x15,x13,x7
    359 	adcs	x25,x25,x16
    360 	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
    361 	adcs	x26,x26,x17
    362 	mul	x17,x10,x8
    363 	adcs	x19,x19,x14
    364 	mul	x14,x11,x8
    365 	adc	x20,x20,x15
    366 
    367 	mul	x15,x12,x8
    368 	adds	x24,x24,x16
    369 	mul	x16,x13,x8
    370 	adcs	x25,x25,x17
    371 	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
    372 	adcs	x26,x26,x14
    373 	umulh	x14,x10,x8
    374 	adcs	x19,x19,x15
    375 	umulh	x15,x11,x8
    376 	adcs	x20,x20,x16
    377 	umulh	x16,x12,x8
    378 	stp	x23,x24,[x2],#8*2	// t[4..5]
    379 	adc	x21,xzr,xzr		// t[10]
    380 	adds	x25,x25,x17
    381 	umulh	x17,x13,x8
    382 	adcs	x26,x26,x14
    383 	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
    384 	adcs	x19,x19,x15
    385 	mul	x15,x11,x9
    386 	adcs	x20,x20,x16
    387 	mul	x16,x12,x9
    388 	adc	x21,x21,x17
    389 
    390 	mul	x17,x13,x9
    391 	adds	x26,x26,x14
    392 	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
    393 	adcs	x19,x19,x15
    394 	umulh	x15,x11,x9
    395 	adcs	x20,x20,x16
    396 	umulh	x16,x12,x9
    397 	adcs	x21,x21,x17
    398 	umulh	x17,x13,x9
    399 	stp	x25,x26,[x2],#8*2	// t[6..7]
    400 	adc	x22,xzr,xzr		// t[11]
    401 	adds	x19,x19,x14
    402 	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
    403 	adcs	x20,x20,x15
    404 	mul	x15,x12,x10
    405 	adcs	x21,x21,x16
    406 	mul	x16,x13,x10
    407 	adc	x22,x22,x17
    408 
    409 	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
    410 	adds	x20,x20,x14
    411 	umulh	x14,x12,x10
    412 	adcs	x21,x21,x15
    413 	umulh	x15,x13,x10
    414 	adcs	x22,x22,x16
    415 	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
    416 	adc	x23,xzr,xzr		// t[12]
    417 	adds	x21,x21,x17
    418 	mul	x17,x13,x11
    419 	adcs	x22,x22,x14
    420 	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
    421 	adc	x23,x23,x15
    422 
    423 	umulh	x15,x13,x11
    424 	adds	x22,x22,x16
    425 	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
    426 	adcs	x23,x23,x17
    427 	umulh	x17,x13,x12		// hi(a[7]*a[6])
    428 	adc	x24,xzr,xzr		// t[13]
    429 	adds	x23,x23,x14
    430 	sub	x27,x3,x1	// done yet?
    431 	adc	x24,x24,x15
    432 
    433 	adds	x24,x24,x16
    434 	sub	x14,x3,x5	// rewinded ap
    435 	adc	x25,xzr,xzr		// t[14]
    436 	add	x25,x25,x17
    437 
    438 	cbz	x27,Lsqr8x_outer_break
    439 
    440 	mov	x4,x6
    441 	ldp	x6,x7,[x2,#8*0]
    442 	ldp	x8,x9,[x2,#8*2]
    443 	ldp	x10,x11,[x2,#8*4]
    444 	ldp	x12,x13,[x2,#8*6]
    445 	adds	x19,x19,x6
    446 	adcs	x20,x20,x7
    447 	ldp	x6,x7,[x1,#8*0]
    448 	adcs	x21,x21,x8
    449 	adcs	x22,x22,x9
    450 	ldp	x8,x9,[x1,#8*2]
    451 	adcs	x23,x23,x10
    452 	adcs	x24,x24,x11
    453 	ldp	x10,x11,[x1,#8*4]
    454 	adcs	x25,x25,x12
    455 	mov	x0,x1
    456 	adcs	x26,xzr,x13
    457 	ldp	x12,x13,[x1,#8*6]
    458 	add	x1,x1,#8*8
    459 	//adc	x28,xzr,xzr		// moved below
    460 	mov	x27,#-8*8
    461 
    462 	//                                                         a[8]a[0]
    463 	//                                                     a[9]a[0]
    464 	//                                                 a[a]a[0]
    465 	//                                             a[b]a[0]
    466 	//                                         a[c]a[0]
    467 	//                                     a[d]a[0]
    468 	//                                 a[e]a[0]
    469 	//                             a[f]a[0]
    470 	//                                                     a[8]a[1]
    471 	//                         a[f]a[1]........................
    472 	//                                                 a[8]a[2]
    473 	//                     a[f]a[2]........................
    474 	//                                             a[8]a[3]
    475 	//                 a[f]a[3]........................
    476 	//                                         a[8]a[4]
    477 	//             a[f]a[4]........................
    478 	//                                     a[8]a[5]
    479 	//         a[f]a[5]........................
    480 	//                                 a[8]a[6]
    481 	//     a[f]a[6]........................
    482 	//                             a[8]a[7]
    483 	// a[f]a[7]........................
    484 Lsqr8x_mul:
    485 	mul	x14,x6,x4
    486 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
    487 	mul	x15,x7,x4
    488 	add	x27,x27,#8
    489 	mul	x16,x8,x4
    490 	mul	x17,x9,x4
    491 	adds	x19,x19,x14
    492 	mul	x14,x10,x4
    493 	adcs	x20,x20,x15
    494 	mul	x15,x11,x4
    495 	adcs	x21,x21,x16
    496 	mul	x16,x12,x4
    497 	adcs	x22,x22,x17
    498 	mul	x17,x13,x4
    499 	adcs	x23,x23,x14
    500 	umulh	x14,x6,x4
    501 	adcs	x24,x24,x15
    502 	umulh	x15,x7,x4
    503 	adcs	x25,x25,x16
    504 	umulh	x16,x8,x4
    505 	adcs	x26,x26,x17
    506 	umulh	x17,x9,x4
    507 	adc	x28,x28,xzr
    508 	str	x19,[x2],#8
    509 	adds	x19,x20,x14
    510 	umulh	x14,x10,x4
    511 	adcs	x20,x21,x15
    512 	umulh	x15,x11,x4
    513 	adcs	x21,x22,x16
    514 	umulh	x16,x12,x4
    515 	adcs	x22,x23,x17
    516 	umulh	x17,x13,x4
    517 	ldr	x4,[x0,x27]
    518 	adcs	x23,x24,x14
    519 	adcs	x24,x25,x15
    520 	adcs	x25,x26,x16
    521 	adcs	x26,x28,x17
    522 	//adc	x28,xzr,xzr		// moved above
    523 	cbnz	x27,Lsqr8x_mul
    524 					// note that carry flag is guaranteed
    525 					// to be zero at this point
    526 	cmp	x1,x3		// done yet?
    527 	b.eq	Lsqr8x_break
    528 
    529 	ldp	x6,x7,[x2,#8*0]
    530 	ldp	x8,x9,[x2,#8*2]
    531 	ldp	x10,x11,[x2,#8*4]
    532 	ldp	x12,x13,[x2,#8*6]
    533 	adds	x19,x19,x6
    534 	ldr	x4,[x0,#-8*8]
    535 	adcs	x20,x20,x7
    536 	ldp	x6,x7,[x1,#8*0]
    537 	adcs	x21,x21,x8
    538 	adcs	x22,x22,x9
    539 	ldp	x8,x9,[x1,#8*2]
    540 	adcs	x23,x23,x10
    541 	adcs	x24,x24,x11
    542 	ldp	x10,x11,[x1,#8*4]
    543 	adcs	x25,x25,x12
    544 	mov	x27,#-8*8
    545 	adcs	x26,x26,x13
    546 	ldp	x12,x13,[x1,#8*6]
    547 	add	x1,x1,#8*8
    548 	//adc	x28,xzr,xzr		// moved above
    549 	b	Lsqr8x_mul
    550 
    551 .align	4
    552 Lsqr8x_break:
    553 	ldp	x6,x7,[x0,#8*0]
    554 	add	x1,x0,#8*8
    555 	ldp	x8,x9,[x0,#8*2]
    556 	sub	x14,x3,x1		// is it last iteration?
    557 	ldp	x10,x11,[x0,#8*4]
    558 	sub	x15,x2,x14
    559 	ldp	x12,x13,[x0,#8*6]
    560 	cbz	x14,Lsqr8x_outer_loop
    561 
    562 	stp	x19,x20,[x2,#8*0]
    563 	ldp	x19,x20,[x15,#8*0]
    564 	stp	x21,x22,[x2,#8*2]
    565 	ldp	x21,x22,[x15,#8*2]
    566 	stp	x23,x24,[x2,#8*4]
    567 	ldp	x23,x24,[x15,#8*4]
    568 	stp	x25,x26,[x2,#8*6]
    569 	mov	x2,x15
    570 	ldp	x25,x26,[x15,#8*6]
    571 	b	Lsqr8x_outer_loop
    572 
    573 .align	4
    574 Lsqr8x_outer_break:
    575 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
    576 	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
    577 	ldp	x15,x16,[sp,#8*1]
    578 	ldp	x11,x13,[x14,#8*2]
    579 	add	x1,x14,#8*4
    580 	ldp	x17,x14,[sp,#8*3]
    581 
    582 	stp	x19,x20,[x2,#8*0]
    583 	mul	x19,x7,x7
    584 	stp	x21,x22,[x2,#8*2]
    585 	umulh	x7,x7,x7
    586 	stp	x23,x24,[x2,#8*4]
    587 	mul	x8,x9,x9
    588 	stp	x25,x26,[x2,#8*6]
    589 	mov	x2,sp
    590 	umulh	x9,x9,x9
    591 	adds	x20,x7,x15,lsl#1
    592 	extr	x15,x16,x15,#63
    593 	sub	x27,x5,#8*4
    594 
    595 Lsqr4x_shift_n_add:
    596 	adcs	x21,x8,x15
    597 	extr	x16,x17,x16,#63
    598 	sub	x27,x27,#8*4
    599 	adcs	x22,x9,x16
    600 	ldp	x15,x16,[x2,#8*5]
    601 	mul	x10,x11,x11
    602 	ldp	x7,x9,[x1],#8*2
    603 	umulh	x11,x11,x11
    604 	mul	x12,x13,x13
    605 	umulh	x13,x13,x13
    606 	extr	x17,x14,x17,#63
    607 	stp	x19,x20,[x2,#8*0]
    608 	adcs	x23,x10,x17
    609 	extr	x14,x15,x14,#63
    610 	stp	x21,x22,[x2,#8*2]
    611 	adcs	x24,x11,x14
    612 	ldp	x17,x14,[x2,#8*7]
    613 	extr	x15,x16,x15,#63
    614 	adcs	x25,x12,x15
    615 	extr	x16,x17,x16,#63
    616 	adcs	x26,x13,x16
    617 	ldp	x15,x16,[x2,#8*9]
    618 	mul	x6,x7,x7
    619 	ldp	x11,x13,[x1],#8*2
    620 	umulh	x7,x7,x7
    621 	mul	x8,x9,x9
    622 	umulh	x9,x9,x9
    623 	stp	x23,x24,[x2,#8*4]
    624 	extr	x17,x14,x17,#63
    625 	stp	x25,x26,[x2,#8*6]
    626 	add	x2,x2,#8*8
    627 	adcs	x19,x6,x17
    628 	extr	x14,x15,x14,#63
    629 	adcs	x20,x7,x14
    630 	ldp	x17,x14,[x2,#8*3]
    631 	extr	x15,x16,x15,#63
    632 	cbnz	x27,Lsqr4x_shift_n_add
    633 	ldp	x1,x4,[x29,#104]	// pull np and n0
    634 
    635 	adcs	x21,x8,x15
    636 	extr	x16,x17,x16,#63
    637 	adcs	x22,x9,x16
    638 	ldp	x15,x16,[x2,#8*5]
    639 	mul	x10,x11,x11
    640 	umulh	x11,x11,x11
    641 	stp	x19,x20,[x2,#8*0]
    642 	mul	x12,x13,x13
    643 	umulh	x13,x13,x13
    644 	stp	x21,x22,[x2,#8*2]
    645 	extr	x17,x14,x17,#63
    646 	adcs	x23,x10,x17
    647 	extr	x14,x15,x14,#63
    648 	ldp	x19,x20,[sp,#8*0]
    649 	adcs	x24,x11,x14
    650 	extr	x15,x16,x15,#63
    651 	ldp	x6,x7,[x1,#8*0]
    652 	adcs	x25,x12,x15
    653 	extr	x16,xzr,x16,#63
    654 	ldp	x8,x9,[x1,#8*2]
    655 	adc	x26,x13,x16
    656 	ldp	x10,x11,[x1,#8*4]
    657 
    658 	// Reduce by 512 bits per iteration
    659 	mul	x28,x4,x19		// t[0]*n0
    660 	ldp	x12,x13,[x1,#8*6]
    661 	add	x3,x1,x5
    662 	ldp	x21,x22,[sp,#8*2]
    663 	stp	x23,x24,[x2,#8*4]
    664 	ldp	x23,x24,[sp,#8*4]
    665 	stp	x25,x26,[x2,#8*6]
    666 	ldp	x25,x26,[sp,#8*6]
    667 	add	x1,x1,#8*8
    668 	mov	x30,xzr		// initial top-most carry
    669 	mov	x2,sp
    670 	mov	x27,#8
    671 
    672 Lsqr8x_reduction:
    673 	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
    674 	mul	x15,x7,x28
    675 	sub	x27,x27,#1
    676 	mul	x16,x8,x28
    677 	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
    678 	mul	x17,x9,x28
    679 	// (*)	adds	xzr,x19,x14
    680 	subs	xzr,x19,#1		// (*)
    681 	mul	x14,x10,x28
    682 	adcs	x19,x20,x15
    683 	mul	x15,x11,x28
    684 	adcs	x20,x21,x16
    685 	mul	x16,x12,x28
    686 	adcs	x21,x22,x17
    687 	mul	x17,x13,x28
    688 	adcs	x22,x23,x14
    689 	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
    690 	adcs	x23,x24,x15
    691 	umulh	x15,x7,x28
    692 	adcs	x24,x25,x16
    693 	umulh	x16,x8,x28
    694 	adcs	x25,x26,x17
    695 	umulh	x17,x9,x28
    696 	adc	x26,xzr,xzr
    697 	adds	x19,x19,x14
    698 	umulh	x14,x10,x28
    699 	adcs	x20,x20,x15
    700 	umulh	x15,x11,x28
    701 	adcs	x21,x21,x16
    702 	umulh	x16,x12,x28
    703 	adcs	x22,x22,x17
    704 	umulh	x17,x13,x28
    705 	mul	x28,x4,x19		// next t[0]*n0
    706 	adcs	x23,x23,x14
    707 	adcs	x24,x24,x15
    708 	adcs	x25,x25,x16
    709 	adc	x26,x26,x17
    710 	cbnz	x27,Lsqr8x_reduction
    711 
    712 	ldp	x14,x15,[x2,#8*0]
    713 	ldp	x16,x17,[x2,#8*2]
    714 	mov	x0,x2
    715 	sub	x27,x3,x1	// done yet?
    716 	adds	x19,x19,x14
    717 	adcs	x20,x20,x15
    718 	ldp	x14,x15,[x2,#8*4]
    719 	adcs	x21,x21,x16
    720 	adcs	x22,x22,x17
    721 	ldp	x16,x17,[x2,#8*6]
    722 	adcs	x23,x23,x14
    723 	adcs	x24,x24,x15
    724 	adcs	x25,x25,x16
    725 	adcs	x26,x26,x17
    726 	//adc	x28,xzr,xzr		// moved below
    727 	cbz	x27,Lsqr8x8_post_condition
    728 
    729 	ldr	x4,[x2,#-8*8]
    730 	ldp	x6,x7,[x1,#8*0]
    731 	ldp	x8,x9,[x1,#8*2]
    732 	ldp	x10,x11,[x1,#8*4]
    733 	mov	x27,#-8*8
    734 	ldp	x12,x13,[x1,#8*6]
    735 	add	x1,x1,#8*8
    736 
    737 Lsqr8x_tail:
    738 	mul	x14,x6,x4
    739 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
    740 	mul	x15,x7,x4
    741 	add	x27,x27,#8
    742 	mul	x16,x8,x4
    743 	mul	x17,x9,x4
    744 	adds	x19,x19,x14
    745 	mul	x14,x10,x4
    746 	adcs	x20,x20,x15
    747 	mul	x15,x11,x4
    748 	adcs	x21,x21,x16
    749 	mul	x16,x12,x4
    750 	adcs	x22,x22,x17
    751 	mul	x17,x13,x4
    752 	adcs	x23,x23,x14
    753 	umulh	x14,x6,x4
    754 	adcs	x24,x24,x15
    755 	umulh	x15,x7,x4
    756 	adcs	x25,x25,x16
    757 	umulh	x16,x8,x4
    758 	adcs	x26,x26,x17
    759 	umulh	x17,x9,x4
    760 	adc	x28,x28,xzr
    761 	str	x19,[x2],#8
    762 	adds	x19,x20,x14
    763 	umulh	x14,x10,x4
    764 	adcs	x20,x21,x15
    765 	umulh	x15,x11,x4
    766 	adcs	x21,x22,x16
    767 	umulh	x16,x12,x4
    768 	adcs	x22,x23,x17
    769 	umulh	x17,x13,x4
    770 	ldr	x4,[x0,x27]
    771 	adcs	x23,x24,x14
    772 	adcs	x24,x25,x15
    773 	adcs	x25,x26,x16
    774 	adcs	x26,x28,x17
    775 	//adc	x28,xzr,xzr		// moved above
    776 	cbnz	x27,Lsqr8x_tail
    777 					// note that carry flag is guaranteed
    778 					// to be zero at this point
    779 	ldp	x6,x7,[x2,#8*0]
    780 	sub	x27,x3,x1	// done yet?
    781 	sub	x16,x3,x5	// rewinded np
    782 	ldp	x8,x9,[x2,#8*2]
    783 	ldp	x10,x11,[x2,#8*4]
    784 	ldp	x12,x13,[x2,#8*6]
    785 	cbz	x27,Lsqr8x_tail_break
    786 
    787 	ldr	x4,[x0,#-8*8]
    788 	adds	x19,x19,x6
    789 	adcs	x20,x20,x7
    790 	ldp	x6,x7,[x1,#8*0]
    791 	adcs	x21,x21,x8
    792 	adcs	x22,x22,x9
    793 	ldp	x8,x9,[x1,#8*2]
    794 	adcs	x23,x23,x10
    795 	adcs	x24,x24,x11
    796 	ldp	x10,x11,[x1,#8*4]
    797 	adcs	x25,x25,x12
    798 	mov	x27,#-8*8
    799 	adcs	x26,x26,x13
    800 	ldp	x12,x13,[x1,#8*6]
    801 	add	x1,x1,#8*8
    802 	//adc	x28,xzr,xzr		// moved above
    803 	b	Lsqr8x_tail
    804 
    805 .align	4
    806 Lsqr8x_tail_break:
    807 	ldr	x4,[x29,#112]		// pull n0
    808 	add	x27,x2,#8*8		// end of current t[num] window
    809 
    810 	subs	xzr,x30,#1		// "move" top-most carry to carry bit
    811 	adcs	x14,x19,x6
    812 	adcs	x15,x20,x7
    813 	ldp	x19,x20,[x0,#8*0]
    814 	adcs	x21,x21,x8
    815 	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
    816 	adcs	x22,x22,x9
    817 	ldp	x8,x9,[x16,#8*2]
    818 	adcs	x23,x23,x10
    819 	adcs	x24,x24,x11
    820 	ldp	x10,x11,[x16,#8*4]
    821 	adcs	x25,x25,x12
    822 	adcs	x26,x26,x13
    823 	ldp	x12,x13,[x16,#8*6]
    824 	add	x1,x16,#8*8
    825 	adc	x30,xzr,xzr	// top-most carry
    826 	mul	x28,x4,x19
    827 	stp	x14,x15,[x2,#8*0]
    828 	stp	x21,x22,[x2,#8*2]
    829 	ldp	x21,x22,[x0,#8*2]
    830 	stp	x23,x24,[x2,#8*4]
    831 	ldp	x23,x24,[x0,#8*4]
    832 	cmp	x27,x29		// did we hit the bottom?
    833 	stp	x25,x26,[x2,#8*6]
    834 	mov	x2,x0			// slide the window
    835 	ldp	x25,x26,[x0,#8*6]
    836 	mov	x27,#8
    837 	b.ne	Lsqr8x_reduction
    838 
    839 	// Final step. We see if result is larger than modulus, and
    840 	// if it is, subtract the modulus. But comparison implies
    841 	// subtraction. So we subtract modulus, see if it borrowed,
    842 	// and conditionally copy original value.
    843 	ldr	x0,[x29,#96]		// pull rp
    844 	add	x2,x2,#8*8
    845 	subs	x14,x19,x6
    846 	sbcs	x15,x20,x7
    847 	sub	x27,x5,#8*8
    848 	mov	x3,x0		// x0 copy
    849 
    850 Lsqr8x_sub:
    851 	sbcs	x16,x21,x8
    852 	ldp	x6,x7,[x1,#8*0]
    853 	sbcs	x17,x22,x9
    854 	stp	x14,x15,[x0,#8*0]
    855 	sbcs	x14,x23,x10
    856 	ldp	x8,x9,[x1,#8*2]
    857 	sbcs	x15,x24,x11
    858 	stp	x16,x17,[x0,#8*2]
    859 	sbcs	x16,x25,x12
    860 	ldp	x10,x11,[x1,#8*4]
    861 	sbcs	x17,x26,x13
    862 	ldp	x12,x13,[x1,#8*6]
    863 	add	x1,x1,#8*8
    864 	ldp	x19,x20,[x2,#8*0]
    865 	sub	x27,x27,#8*8
    866 	ldp	x21,x22,[x2,#8*2]
    867 	ldp	x23,x24,[x2,#8*4]
    868 	ldp	x25,x26,[x2,#8*6]
    869 	add	x2,x2,#8*8
    870 	stp	x14,x15,[x0,#8*4]
    871 	sbcs	x14,x19,x6
    872 	stp	x16,x17,[x0,#8*6]
    873 	add	x0,x0,#8*8
    874 	sbcs	x15,x20,x7
    875 	cbnz	x27,Lsqr8x_sub
    876 
    877 	sbcs	x16,x21,x8
    878 	mov	x2,sp
    879 	add	x1,sp,x5
    880 	ldp	x6,x7,[x3,#8*0]
    881 	sbcs	x17,x22,x9
    882 	stp	x14,x15,[x0,#8*0]
    883 	sbcs	x14,x23,x10
    884 	ldp	x8,x9,[x3,#8*2]
    885 	sbcs	x15,x24,x11
    886 	stp	x16,x17,[x0,#8*2]
    887 	sbcs	x16,x25,x12
    888 	ldp	x19,x20,[x1,#8*0]
    889 	sbcs	x17,x26,x13
    890 	ldp	x21,x22,[x1,#8*2]
    891 	sbcs	xzr,x30,xzr	// did it borrow?
    892 	ldr	x30,[x29,#8]		// pull return address
    893 	stp	x14,x15,[x0,#8*4]
    894 	stp	x16,x17,[x0,#8*6]
    895 
    896 	sub	x27,x5,#8*4
    897 Lsqr4x_cond_copy:
    898 	sub	x27,x27,#8*4
    899 	csel	x14,x19,x6,lo
    900 	stp	xzr,xzr,[x2,#8*0]
    901 	csel	x15,x20,x7,lo
    902 	ldp	x6,x7,[x3,#8*4]
    903 	ldp	x19,x20,[x1,#8*4]
    904 	csel	x16,x21,x8,lo
    905 	stp	xzr,xzr,[x2,#8*2]
    906 	add	x2,x2,#8*4
    907 	csel	x17,x22,x9,lo
    908 	ldp	x8,x9,[x3,#8*6]
    909 	ldp	x21,x22,[x1,#8*6]
    910 	add	x1,x1,#8*4
    911 	stp	x14,x15,[x3,#8*0]
    912 	stp	x16,x17,[x3,#8*2]
    913 	add	x3,x3,#8*4
    914 	stp	xzr,xzr,[x1,#8*0]
    915 	stp	xzr,xzr,[x1,#8*2]
    916 	cbnz	x27,Lsqr4x_cond_copy
    917 
    918 	csel	x14,x19,x6,lo
    919 	stp	xzr,xzr,[x2,#8*0]
    920 	csel	x15,x20,x7,lo
    921 	stp	xzr,xzr,[x2,#8*2]
    922 	csel	x16,x21,x8,lo
    923 	csel	x17,x22,x9,lo
    924 	stp	x14,x15,[x3,#8*0]
    925 	stp	x16,x17,[x3,#8*2]
    926 
    927 	b	Lsqr8x_done
    928 
    929 .align	4
    930 Lsqr8x8_post_condition:
    931 	adc	x28,xzr,xzr
    932 	ldr	x30,[x29,#8]		// pull return address
    933 	// x19-7,x28 hold result, x6-7 hold modulus
    934 	subs	x6,x19,x6
    935 	ldr	x1,[x29,#96]		// pull rp
    936 	sbcs	x7,x20,x7
    937 	stp	xzr,xzr,[sp,#8*0]
    938 	sbcs	x8,x21,x8
    939 	stp	xzr,xzr,[sp,#8*2]
    940 	sbcs	x9,x22,x9
    941 	stp	xzr,xzr,[sp,#8*4]
    942 	sbcs	x10,x23,x10
    943 	stp	xzr,xzr,[sp,#8*6]
    944 	sbcs	x11,x24,x11
    945 	stp	xzr,xzr,[sp,#8*8]
    946 	sbcs	x12,x25,x12
    947 	stp	xzr,xzr,[sp,#8*10]
    948 	sbcs	x13,x26,x13
    949 	stp	xzr,xzr,[sp,#8*12]
    950 	sbcs	x28,x28,xzr	// did it borrow?
    951 	stp	xzr,xzr,[sp,#8*14]
    952 
    953 	// x6-7 hold result-modulus
    954 	csel	x6,x19,x6,lo
    955 	csel	x7,x20,x7,lo
    956 	csel	x8,x21,x8,lo
    957 	csel	x9,x22,x9,lo
    958 	stp	x6,x7,[x1,#8*0]
    959 	csel	x10,x23,x10,lo
    960 	csel	x11,x24,x11,lo
    961 	stp	x8,x9,[x1,#8*2]
    962 	csel	x12,x25,x12,lo
    963 	csel	x13,x26,x13,lo
    964 	stp	x10,x11,[x1,#8*4]
    965 	stp	x12,x13,[x1,#8*6]
    966 
    967 Lsqr8x_done:
    968 	ldp	x19,x20,[x29,#16]
    969 	mov	sp,x29
    970 	ldp	x21,x22,[x29,#32]
    971 	mov	x0,#1
    972 	ldp	x23,x24,[x29,#48]
    973 	ldp	x25,x26,[x29,#64]
    974 	ldp	x27,x28,[x29,#80]
    975 	ldr	x29,[sp],#128
    976 	ret
    977 
    978 
    979 .align	5
    980 __bn_mul4x_mont:
    981 	stp	x29,x30,[sp,#-128]!
    982 	add	x29,sp,#0
    983 	stp	x19,x20,[sp,#16]
    984 	stp	x21,x22,[sp,#32]
    985 	stp	x23,x24,[sp,#48]
    986 	stp	x25,x26,[sp,#64]
    987 	stp	x27,x28,[sp,#80]
    988 
    989 	sub	x26,sp,x5,lsl#3
    990 	lsl	x5,x5,#3
    991 	ldr	x4,[x4]		// *n0
    992 	sub	sp,x26,#8*4		// alloca
    993 
    994 	add	x10,x2,x5
    995 	add	x27,x1,x5
    996 	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
    997 
    998 	ldr	x24,[x2,#8*0]		// b[0]
    999 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1000 	ldp	x8,x9,[x1,#8*2]
   1001 	add	x1,x1,#8*4
   1002 	mov	x19,xzr
   1003 	mov	x20,xzr
   1004 	mov	x21,xzr
   1005 	mov	x22,xzr
   1006 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1007 	ldp	x16,x17,[x3,#8*2]
   1008 	adds	x3,x3,#8*4		// clear carry bit
   1009 	mov	x0,xzr
   1010 	mov	x28,#0
   1011 	mov	x26,sp
   1012 
   1013 Loop_mul4x_1st_reduction:
   1014 	mul	x10,x6,x24		// lo(a[0..3]*b[0])
   1015 	adc	x0,x0,xzr	// modulo-scheduled
   1016 	mul	x11,x7,x24
   1017 	add	x28,x28,#8
   1018 	mul	x12,x8,x24
   1019 	and	x28,x28,#31
   1020 	mul	x13,x9,x24
   1021 	adds	x19,x19,x10
   1022 	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
   1023 	adcs	x20,x20,x11
   1024 	mul	x25,x19,x4		// t[0]*n0
   1025 	adcs	x21,x21,x12
   1026 	umulh	x11,x7,x24
   1027 	adcs	x22,x22,x13
   1028 	umulh	x12,x8,x24
   1029 	adc	x23,xzr,xzr
   1030 	umulh	x13,x9,x24
   1031 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1032 	adds	x20,x20,x10
   1033 	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
   1034 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1035 	adcs	x21,x21,x11
   1036 	mul	x11,x15,x25
   1037 	adcs	x22,x22,x12
   1038 	mul	x12,x16,x25
   1039 	adc	x23,x23,x13		// can't overflow
   1040 	mul	x13,x17,x25
   1041 	// (*)	adds	xzr,x19,x10
   1042 	subs	xzr,x19,#1		// (*)
   1043 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1044 	adcs	x19,x20,x11
   1045 	umulh	x11,x15,x25
   1046 	adcs	x20,x21,x12
   1047 	umulh	x12,x16,x25
   1048 	adcs	x21,x22,x13
   1049 	umulh	x13,x17,x25
   1050 	adcs	x22,x23,x0
   1051 	adc	x0,xzr,xzr
   1052 	adds	x19,x19,x10
   1053 	sub	x10,x27,x1
   1054 	adcs	x20,x20,x11
   1055 	adcs	x21,x21,x12
   1056 	adcs	x22,x22,x13
   1057 	//adc	x0,x0,xzr
   1058 	cbnz	x28,Loop_mul4x_1st_reduction
   1059 
   1060 	cbz	x10,Lmul4x4_post_condition
   1061 
   1062 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1063 	ldp	x8,x9,[x1,#8*2]
   1064 	add	x1,x1,#8*4
   1065 	ldr	x25,[sp]		// a[0]*n0
   1066 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1067 	ldp	x16,x17,[x3,#8*2]
   1068 	add	x3,x3,#8*4
   1069 
        // First-pass tail: accumulate a[j..j+3]*b[i] plus n[j..j+3]*m for
        // the upper limb groups, where m = t[0]*n0 was stashed on the
        // stack by the reduction loop and is reloaded via [sp,x28].  One
        // reduced limb is emitted per pass (str x19,[x26],#8).
   1070 Loop_mul4x_1st_tail:
   1071 	mul	x10,x6,x24		// lo(a[4..7]*b[i])
   1072 	adc	x0,x0,xzr	// modulo-scheduled
   1073 	mul	x11,x7,x24
   1074 	add	x28,x28,#8
   1075 	mul	x12,x8,x24
   1076 	and	x28,x28,#31
   1077 	mul	x13,x9,x24
   1078 	adds	x19,x19,x10
   1079 	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
   1080 	adcs	x20,x20,x11
   1081 	umulh	x11,x7,x24
   1082 	adcs	x21,x21,x12
   1083 	umulh	x12,x8,x24
   1084 	adcs	x22,x22,x13
   1085 	umulh	x13,x9,x24
   1086 	adc	x23,xzr,xzr
   1087 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1088 	adds	x20,x20,x10
   1089 	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
   1090 	adcs	x21,x21,x11
   1091 	mul	x11,x15,x25
   1092 	adcs	x22,x22,x12
   1093 	mul	x12,x16,x25
   1094 	adc	x23,x23,x13		// can't overflow
   1095 	mul	x13,x17,x25
   1096 	adds	x19,x19,x10
   1097 	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
   1098 	adcs	x20,x20,x11
   1099 	umulh	x11,x15,x25
   1100 	adcs	x21,x21,x12
   1101 	umulh	x12,x16,x25
   1102 	adcs	x22,x22,x13
   1103 	adcs	x23,x23,x0
   1104 	umulh	x13,x17,x25
   1105 	adc	x0,xzr,xzr
   1106 	ldr	x25,[sp,x28]		// next t[0]*n0
   1107 	str	x19,[x26],#8		// result!!!
   1108 	adds	x19,x20,x10
   1109 	sub	x10,x27,x1		// done yet?
   1110 	adcs	x20,x21,x11
   1111 	adcs	x21,x22,x12
   1112 	adcs	x22,x23,x13
   1113 	//adc	x0,x0,xzr
   1114 	cbnz	x28,Loop_mul4x_1st_tail
   1115 
   1116 	sub	x11,x27,x5	// rewinded x1
   1117 	cbz	x10,Lmul4x_proceed
   1118 
        // More a[] limbs remain for this b group: load the next a[]/n[]
        // quads and run the tail loop again.
   1119 	ldp	x6,x7,[x1,#8*0]
   1120 	ldp	x8,x9,[x1,#8*2]
   1121 	add	x1,x1,#8*4
   1122 	ldp	x14,x15,[x3,#8*0]
   1123 	ldp	x16,x17,[x3,#8*2]
   1124 	add	x3,x3,#8*4
   1125 	b	Loop_mul4x_1st_tail
   1126 
        // All of a[] consumed for the current group of four b words:
        // stash the group carry in x30, store the top four tail limbs,
        // rewind ap/np, and reload t[0..3] from the stack for the next
        // reduction pass.
   1127 .align	5
   1128 Lmul4x_proceed:
   1129 	ldr	x24,[x2,#8*4]!		// *++b
   1130 	adc	x30,x0,xzr
   1131 	ldp	x6,x7,[x11,#8*0]	// a[0..3]
   1132 	sub	x3,x3,x5		// rewind np
   1133 	ldp	x8,x9,[x11,#8*2]
   1134 	add	x1,x11,#8*4
   1135 
   1136 	stp	x19,x20,[x26,#8*0]	// result!!!
   1137 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1138 	stp	x21,x22,[x26,#8*2]	// result!!!
   1139 	ldp	x21,x22,[sp,#8*6]
   1140 
   1141 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1142 	mov	x26,sp
   1143 	ldp	x16,x17,[x3,#8*2]
   1144 	adds	x3,x3,#8*4		// clear carry bit
   1145 	mov	x0,xzr
   1146 
        // Reduction loop for the 2nd and later b-word groups: same shape
        // as Loop_mul4x_1st_reduction, but the accumulator starts from
        // the previously computed t[0..3] instead of a fresh product.
   1147 .align	4
   1148 Loop_mul4x_reduction:
   1149 	mul	x10,x6,x24		// lo(a[0..3]*b[4])
   1150 	adc	x0,x0,xzr	// modulo-scheduled
   1151 	mul	x11,x7,x24
   1152 	add	x28,x28,#8
   1153 	mul	x12,x8,x24
   1154 	and	x28,x28,#31
   1155 	mul	x13,x9,x24
   1156 	adds	x19,x19,x10
   1157 	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
   1158 	adcs	x20,x20,x11
   1159 	mul	x25,x19,x4		// t[0]*n0
   1160 	adcs	x21,x21,x12
   1161 	umulh	x11,x7,x24
   1162 	adcs	x22,x22,x13
   1163 	umulh	x12,x8,x24
   1164 	adc	x23,xzr,xzr
   1165 	umulh	x13,x9,x24
   1166 	ldr	x24,[x2,x28]		// next b[i]
   1167 	adds	x20,x20,x10
   1168 	// (*)	mul	x10,x14,x25
   1169 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1170 	adcs	x21,x21,x11
   1171 	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
   1172 	adcs	x22,x22,x12
   1173 	mul	x12,x16,x25
   1174 	adc	x23,x23,x13		// can't overflow
   1175 	mul	x13,x17,x25
        // (*) same carry trick as in Loop_mul4x_1st_reduction: the
        // elided adds of lo(n[0]*m) carries iff x19 != 0.
   1176 	// (*)	adds	xzr,x19,x10
   1177 	subs	xzr,x19,#1		// (*)
   1178 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
   1179 	adcs	x19,x20,x11
   1180 	umulh	x11,x15,x25
   1181 	adcs	x20,x21,x12
   1182 	umulh	x12,x16,x25
   1183 	adcs	x21,x22,x13
   1184 	umulh	x13,x17,x25
   1185 	adcs	x22,x23,x0
   1186 	adc	x0,xzr,xzr
   1187 	adds	x19,x19,x10
   1188 	adcs	x20,x20,x11
   1189 	adcs	x21,x21,x12
   1190 	adcs	x22,x22,x13
   1191 	//adc	x0,x0,xzr
   1192 	cbnz	x28,Loop_mul4x_reduction
   1193 
        // Fold the stacked t[4..7] into the accumulator before entering
        // the tail loop, then load the next a[]/n[] quads and the first
        // stashed t[0]*n0.
   1194 	adc	x0,x0,xzr
   1195 	ldp	x10,x11,[x26,#8*4]	// t[4..7]
   1196 	ldp	x12,x13,[x26,#8*6]
   1197 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1198 	ldp	x8,x9,[x1,#8*2]
   1199 	add	x1,x1,#8*4
   1200 	adds	x19,x19,x10
   1201 	adcs	x20,x20,x11
   1202 	adcs	x21,x21,x12
   1203 	adcs	x22,x22,x13
   1204 	//adc	x0,x0,xzr
   1205 
   1206 	ldr	x25,[sp]		// t[0]*n0
   1207 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1208 	ldp	x16,x17,[x3,#8*2]
   1209 	add	x3,x3,#8*4
   1210 
        // Tail loop for the 2nd and later b-word groups; mirrors
        // Loop_mul4x_1st_tail, reloading the stashed t[0]*n0 values via
        // [sp,x28] and emitting one reduced limb per pass.
   1211 .align	4
   1212 Loop_mul4x_tail:
   1213 	mul	x10,x6,x24		// lo(a[4..7]*b[4])
   1214 	adc	x0,x0,xzr	// modulo-scheduled
   1215 	mul	x11,x7,x24
   1216 	add	x28,x28,#8
   1217 	mul	x12,x8,x24
   1218 	and	x28,x28,#31
   1219 	mul	x13,x9,x24
   1220 	adds	x19,x19,x10
   1221 	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
   1222 	adcs	x20,x20,x11
   1223 	umulh	x11,x7,x24
   1224 	adcs	x21,x21,x12
   1225 	umulh	x12,x8,x24
   1226 	adcs	x22,x22,x13
   1227 	umulh	x13,x9,x24
   1228 	adc	x23,xzr,xzr
   1229 	ldr	x24,[x2,x28]		// next b[i]
   1230 	adds	x20,x20,x10
   1231 	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
   1232 	adcs	x21,x21,x11
   1233 	mul	x11,x15,x25
   1234 	adcs	x22,x22,x12
   1235 	mul	x12,x16,x25
   1236 	adc	x23,x23,x13		// can't overflow
   1237 	mul	x13,x17,x25
   1238 	adds	x19,x19,x10
   1239 	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
   1240 	adcs	x20,x20,x11
   1241 	umulh	x11,x15,x25
   1242 	adcs	x21,x21,x12
   1243 	umulh	x12,x16,x25
   1244 	adcs	x22,x22,x13
   1245 	umulh	x13,x17,x25
   1246 	adcs	x23,x23,x0
   1247 	ldr	x25,[sp,x28]		// next a[0]*n0
   1248 	adc	x0,xzr,xzr
   1249 	str	x19,[x26],#8		// result!!!
   1250 	adds	x19,x20,x10
   1251 	sub	x10,x27,x1		// done yet?
   1252 	adcs	x20,x21,x11
   1253 	adcs	x21,x22,x12
   1254 	adcs	x22,x23,x13
   1255 	//adc	x0,x0,xzr
   1256 	cbnz	x28,Loop_mul4x_tail
   1257 
   1258 	sub	x11,x3,x5		// rewinded np?
   1259 	adc	x0,x0,xzr
   1260 	cbz	x10,Loop_mul4x_break
   1261 
        // More limb groups remain: fold the next stacked t[4..7], reload
        // the a[]/n[] quads, and continue the tail loop.
   1262 	ldp	x10,x11,[x26,#8*4]
   1263 	ldp	x12,x13,[x26,#8*6]
   1264 	ldp	x6,x7,[x1,#8*0]
   1265 	ldp	x8,x9,[x1,#8*2]
   1266 	add	x1,x1,#8*4
   1267 	adds	x19,x19,x10
   1268 	adcs	x20,x20,x11
   1269 	adcs	x21,x21,x12
   1270 	adcs	x22,x22,x13
   1271 	//adc	x0,x0,xzr
   1272 	ldp	x14,x15,[x3,#8*0]
   1273 	ldp	x16,x17,[x3,#8*2]
   1274 	add	x3,x3,#8*4
   1275 	b	Loop_mul4x_tail
   1276 
        // Current group of four b words fully processed: fold the saved
        // group carry (x30), advance bp, and loop back for the next four
        // b words until bp reaches &b[num] (the frame slot at [x29,#96]
        // holds rp and &b[num]).
   1277 .align	4
   1278 Loop_mul4x_break:
   1279 	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
   1280 	adds	x19,x19,x30
   1281 	add	x2,x2,#8*4		// bp++
   1282 	adcs	x20,x20,xzr
   1283 	sub	x1,x1,x5		// rewind ap
   1284 	adcs	x21,x21,xzr
   1285 	stp	x19,x20,[x26,#8*0]	// result!!!
   1286 	adcs	x22,x22,xzr
   1287 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1288 	adc	x30,x0,xzr
   1289 	stp	x21,x22,[x26,#8*2]	// result!!!
   1290 	cmp	x2,x13			// done yet?
   1291 	ldp	x21,x22,[sp,#8*6]
   1292 	ldp	x14,x15,[x11,#8*0]	// n[0..3]
   1293 	ldp	x16,x17,[x11,#8*2]
   1294 	add	x3,x11,#8*4
   1295 	b.eq	Lmul4x_post
   1296 
   1297 	ldr	x24,[x2]
   1298 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1299 	ldp	x8,x9,[x1,#8*2]
   1300 	adds	x1,x1,#8*4		// clear carry bit
   1301 	mov	x0,xzr
   1302 	mov	x26,sp
   1303 	b	Loop_mul4x_reduction
   1304 
   1305 .align	4
   1306 Lmul4x_post:
   1307 	// Final step. We see if result is larger than modulus, and
   1308 	// if it is, subtract the modulus. But comparison implies
   1309 	// subtraction. So we subtract modulus, see if it borrowed,
   1310 	// and conditionally copy original value.
   1311 	mov	x0,x12
   1312 	mov	x27,x12		// x0 copy
   1313 	subs	x10,x19,x14
   1314 	add	x26,sp,#8*8
   1315 	sbcs	x11,x20,x15
   1316 	sub	x28,x5,#8*4
   1317 
   1318 Lmul4x_sub:
        // NB: the borrow from the sbcs just above the label is consumed
        // by the first sbcs inside the loop -- every other instruction
        // in the loop (ldp/add/sub/stp/cbnz) leaves the flags untouched,
        // so the borrow chain spans all num limbs.
   1319 	sbcs	x12,x21,x16
   1320 	ldp	x14,x15,[x3,#8*0]
   1321 	sub	x28,x28,#8*4
   1322 	ldp	x19,x20,[x26,#8*0]
   1323 	sbcs	x13,x22,x17
   1324 	ldp	x16,x17,[x3,#8*2]
   1325 	add	x3,x3,#8*4
   1326 	ldp	x21,x22,[x26,#8*2]
   1327 	add	x26,x26,#8*4
   1328 	stp	x10,x11,[x0,#8*0]
   1329 	sbcs	x10,x19,x14
   1330 	stp	x12,x13,[x0,#8*2]
   1331 	add	x0,x0,#8*4
   1332 	sbcs	x11,x20,x15
   1333 	cbnz	x28,Lmul4x_sub
   1334 
        // Last four difference limbs, then the decision: after
        // "sbcs xzr,x30,xzr" the 'lo' condition means the subtraction
        // borrowed (result was below the modulus), so the unreduced copy
        // kept on the stack is selected; otherwise the subtracted limbs
        // already stored at rp win.  Every instruction in the copy loop
        // below preserves the flags, so 'lo' stays valid throughout.
        // The stack scratch is zeroed as it is consumed.
   1335 	sbcs	x12,x21,x16
   1336 	mov	x26,sp
   1337 	add	x1,sp,#8*4
   1338 	ldp	x6,x7,[x27,#8*0]
   1339 	sbcs	x13,x22,x17
   1340 	stp	x10,x11,[x0,#8*0]
   1341 	ldp	x8,x9,[x27,#8*2]
   1342 	stp	x12,x13,[x0,#8*2]
   1343 	ldp	x19,x20,[x1,#8*0]
   1344 	ldp	x21,x22,[x1,#8*2]
   1345 	sbcs	xzr,x30,xzr	// did it borrow?
   1346 	ldr	x30,[x29,#8]		// pull return address
   1347 
   1348 	sub	x28,x5,#8*4
   1349 Lmul4x_cond_copy:
   1350 	sub	x28,x28,#8*4
   1351 	csel	x10,x19,x6,lo
   1352 	stp	xzr,xzr,[x26,#8*0]
   1353 	csel	x11,x20,x7,lo
   1354 	ldp	x6,x7,[x27,#8*4]
   1355 	ldp	x19,x20,[x1,#8*4]
   1356 	csel	x12,x21,x8,lo
   1357 	stp	xzr,xzr,[x26,#8*2]
   1358 	add	x26,x26,#8*4
   1359 	csel	x13,x22,x9,lo
   1360 	ldp	x8,x9,[x27,#8*6]
   1361 	ldp	x21,x22,[x1,#8*6]
   1362 	add	x1,x1,#8*4
   1363 	stp	x10,x11,[x27,#8*0]
   1364 	stp	x12,x13,[x27,#8*2]
   1365 	add	x27,x27,#8*4
   1366 	cbnz	x28,Lmul4x_cond_copy
   1367 
        // Final four limbs outside the loop, then wipe the remaining
        // scratch words.
   1368 	csel	x10,x19,x6,lo
   1369 	stp	xzr,xzr,[x26,#8*0]
   1370 	csel	x11,x20,x7,lo
   1371 	stp	xzr,xzr,[x26,#8*2]
   1372 	csel	x12,x21,x8,lo
   1373 	stp	xzr,xzr,[x26,#8*3]
   1374 	csel	x13,x22,x9,lo
   1375 	stp	xzr,xzr,[x26,#8*4]
   1376 	stp	x10,x11,[x27,#8*0]
   1377 	stp	x12,x13,[x27,#8*2]
   1378 
   1379 	b	Lmul4x_done
   1380 
        // num == 4 path: the whole result still sits in x19..x22 with
        // the top carry in x0.  Subtract the modulus (x14..x17), use the
        // final borrow to choose reduced vs. original, store to rp, and
        // zero the stack scratch.
   1381 .align	4
   1382 Lmul4x4_post_condition:
   1383 	adc	x0,x0,xzr
   1384 	ldr	x1,[x29,#96]		// pull rp
   1385 	// x19-3,x0 hold result, x14-7 hold modulus
   1386 	subs	x6,x19,x14
   1387 	ldr	x30,[x29,#8]		// pull return address
   1388 	sbcs	x7,x20,x15
   1389 	stp	xzr,xzr,[sp,#8*0]
   1390 	sbcs	x8,x21,x16
   1391 	stp	xzr,xzr,[sp,#8*2]
   1392 	sbcs	x9,x22,x17
   1393 	stp	xzr,xzr,[sp,#8*4]
   1394 	sbcs	xzr,x0,xzr		// did it borrow?
   1395 	stp	xzr,xzr,[sp,#8*6]
   1396 
   1397 	// x6-3 hold result-modulus
   1398 	csel	x6,x19,x6,lo
   1399 	csel	x7,x20,x7,lo
   1400 	csel	x8,x21,x8,lo
   1401 	csel	x9,x22,x9,lo
   1402 	stp	x6,x7,[x1,#8*0]
   1403 	stp	x8,x9,[x1,#8*2]
   1404 
   1405 Lmul4x_done:
        // Common exit: restore callee-saved x19..x28 from the frame, pop
        // the 128-byte frame, and return 1 in x0.
   1406 	ldp	x19,x20,[x29,#16]
   1407 	mov	sp,x29
   1408 	ldp	x21,x22,[x29,#32]
   1409 	mov	x0,#1
   1410 	ldp	x23,x24,[x29,#48]
   1411 	ldp	x25,x26,[x29,#64]
   1412 	ldp	x27,x28,[x29,#80]
   1413 	ldr	x29,[sp],#128
   1414 	ret
   1415 
        // ASCII attribution string: "Montgomery Multiplication for
        // ARMv8, CRYPTOGAMS by <appro@openssl.org>".
   1416 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1417 .align	2
   1418 .align	4
   1419 #endif  // !OPENSSL_NO_ASM
   1420