#if defined(__aarch64__)
.text

.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
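	// Arguments follow the usual bn_mul_mont convention (inferred
	// from the comments below): x0=rp, x1=ap, x2=bp, x3=np, x4=&n0,
	// x5=num. Dispatch: a multiple-of-8 num takes the 8x squaring
	// path, a multiple-of-4 num the 4x path, and anything else the
	// generic word-by-word loop below.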
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the result of the first addition is
	//	guaranteed to be zero, which leaves only two computationally
	//	significant outcomes: it either carries or it doesn't. So
	//	when does it carry, and is there another way to deduce it?
	//	If you follow the operations, you can observe that the
	//	condition for a carry is quite simple: x6 being non-zero.
	//	The carry can therefore be calculated by adding -1 to x6,
	//	which is what the next instruction does.
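	// Illustrative C-level sketch (not part of the original source):
	//	carry = (x6 != 0);
	// because x12+x6 is known to wrap to zero, it overflows 2^64
	// exactly when x6 is non-zero, and "subs xzr,x6,#1" sets the
	// carry flag under precisely that condition.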
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
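	// In C-like pseudocode (illustrative names, not in the original):
	//	borrow = subtract_words(rp, tp, np, num);	// rp = tp - np
	//	for (j = 0; j < num; j++)
	//		rp[j] = borrow ? tp[j] : rp[j];		// csel below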
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
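	// Squaring is only applicable when the two multiplicands are the
	// same buffer (ap == bp); otherwise fall through to the 4x path.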
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
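	// The triangle above covers only the cross products a[i]*a[j],
	// i<j; the final square is twice this sum plus the diagonal
	// terms a[i]*a[i], which are folded in at .Lsqr8x_outer_break.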

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
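	// Scheduling note: the "adc x28,xzr,xzr" that logically closes an
	// iteration is modulo-scheduled, i.e. hoisted into the top of the
	// next one (see the "moved above"/"moved below" comments), so the
	// carry flag is kept live across the backward branch.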
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

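	// Doubling is done on the fly while adding the diagonal terms:
	// each stored word t[i] is replaced by (t[i]<<1)|(t[i-1]>>63),
	// where "extr Xd,Xn,Xm,#63" yields the low 64 bits of (Xn:Xm)>>63.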
.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
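	// Standard Montgomery word-step: x28 = t[0]*n0 mod 2^64 is chosen
	// so that t[0] + x28*n[0] == 0 (mod 2^64), letting the bottom
	// word be retired on each of the eight passes below.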
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp
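	// x28 cycles through 8,16,24,0 ("and x28,x28,#31"), so the
	// b[i] and t[0]*n0 loads below walk a 4-word window and wrap,
	// as the "next b[i] (or b[0])" comments note.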

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
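// The .byte string below decodes to "Montgomery Multiplication for
// ARMv8, CRYPTOGAMS by <appro@openssl.org>".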
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif
