// armv8-mont (fipsmodule) — AArch64 Montgomery multiplication, machine-generated assembly.
       1 .text
       2 
       3 .globl	_bn_mul_mont
       4 .private_extern	_bn_mul_mont
       5 
         // int bn_mul_mont(BN_ULONG rp[], const BN_ULONG ap[], const BN_ULONG bp[],
         //                 const BN_ULONG np[], const BN_ULONG *n0, int num)
         //
         // Montgomery multiplication: on entry (AAPCS64) x0=rp, x1=ap, x2=bp,
         // x3=np (modulus), x4=&n0, x5=num (count of 64-bit limbs).  x4 points
         // at the precomputed Montgomery constant n0 (per the bn_mul_mont
         // contract, presumably -np[0]^-1 mod 2^64 — see its use as
         // "tp[0]"*n0 below).  The result, reduced modulo np, is written to
         // rp; the function returns 1 in x0.  num%8==0 and num%4==0 inputs
         // are dispatched to the wider code paths; the code below is the
         // generic one-word-at-a-time loop (tp[] scratch is alloca'd on the
         // stack and wiped before return).
       6 .align	5
       7 _bn_mul_mont:
       8 	tst	x5,#7			// num%8 == 0?
       9 	b.eq	__bn_sqr8x_mont
      10 	tst	x5,#3			// num%4 == 0?
      11 	b.eq	__bn_mul4x_mont
      12 Lmul_mont:
      13 	stp	x29,x30,[sp,#-64]!	// push frame, save fp/lr
      14 	add	x29,sp,#0
      15 	stp	x19,x20,[sp,#16]	// preserve callee-saved x19-x24
      16 	stp	x21,x22,[sp,#32]
      17 	stp	x23,x24,[sp,#48]
      18 
      19 	ldr	x9,[x2],#8		// bp[0]
      20 	sub	x22,sp,x5,lsl#3		// reserve num*8 bytes below sp for tp[]
      21 	ldp	x7,x8,[x1],#16	// ap[0..1]
      22 	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
      23 	ldr	x4,[x4]		// *n0
      24 	and	x22,x22,#-16		// ABI says so
      25 	ldp	x13,x14,[x3],#16	// np[0..1]
      26 
      27 	mul	x6,x7,x9		// ap[0]*bp[0]
      28 	sub	x21,x5,#16		// j=num-2
      29 	umulh	x7,x7,x9
      30 	mul	x10,x8,x9		// ap[1]*bp[0]
      31 	umulh	x11,x8,x9
      32 
      33 	mul	x15,x6,x4		// "tp[0]"*n0
      34 	mov	sp,x22			// alloca
      35 
      36 	// (*)	mul	x12,x13,x15	// np[0]*m1
      37 	umulh	x13,x13,x15
      38 	mul	x16,x14,x15		// np[1]*m1
      39 	// (*)	adds	x12,x12,x6	// discarded
      40 	// (*)	As for removal of first multiplication and addition
      41 	//	instructions. The outcome of first addition is
      42 	//	guaranteed to be zero, which leaves two computationally
      43 	//	significant outcomes: it either carries or not. Then
      44 	//	question is when does it carry? Is there alternative
      45 	//	way to deduce it? If you follow operations, you can
      46 	//	observe that condition for carry is quite simple:
      47 	//	x6 being non-zero. So that carry can be calculated
      48 	//	by adding -1 to x6. That's what next instruction does.
      49 	subs	xzr,x6,#1		// (*)
      50 	umulh	x17,x14,x15
      51 	adc	x13,x13,xzr
      52 	cbz	x21,L1st_skip
      53 
         // First pass (i=0): accumulate ap[j]*bp[0] and np[j]*m1 into tp[].
      54 L1st:
      55 	ldr	x8,[x1],#8
      56 	adds	x6,x10,x7
      57 	sub	x21,x21,#8		// j--
      58 	adc	x7,x11,xzr
      59 
      60 	ldr	x14,[x3],#8
      61 	adds	x12,x16,x13
      62 	mul	x10,x8,x9		// ap[j]*bp[0]
      63 	adc	x13,x17,xzr
      64 	umulh	x11,x8,x9
      65 
      66 	adds	x12,x12,x6
      67 	mul	x16,x14,x15		// np[j]*m1
      68 	adc	x13,x13,xzr
      69 	umulh	x17,x14,x15
      70 	str	x12,[x22],#8		// tp[j-1]
      71 	cbnz	x21,L1st
      72 
      73 L1st_skip:
      74 	adds	x6,x10,x7
      75 	sub	x1,x1,x5		// rewind x1
      76 	adc	x7,x11,xzr
      77 
      78 	adds	x12,x16,x13
      79 	sub	x3,x3,x5		// rewind x3
      80 	adc	x13,x17,xzr
      81 
      82 	adds	x12,x12,x6
      83 	sub	x20,x5,#8		// i=num-1
      84 	adcs	x13,x13,x7
      85 
      86 	adc	x19,xzr,xzr		// upmost overflow bit
      87 	stp	x12,x13,[x22]
      88 
         // Outer loop over remaining bp[i] words; each pass folds
         // ap[]*bp[i] plus the Montgomery multiple of np[] into tp[].
      89 Louter:
      90 	ldr	x9,[x2],#8		// bp[i]
      91 	ldp	x7,x8,[x1],#16
      92 	ldr	x23,[sp]		// tp[0]
      93 	add	x22,sp,#8
      94 
      95 	mul	x6,x7,x9		// ap[0]*bp[i]
      96 	sub	x21,x5,#16		// j=num-2
      97 	umulh	x7,x7,x9
      98 	ldp	x13,x14,[x3],#16
      99 	mul	x10,x8,x9		// ap[1]*bp[i]
     100 	adds	x6,x6,x23
     101 	umulh	x11,x8,x9
     102 	adc	x7,x7,xzr
     103 
     104 	mul	x15,x6,x4
     105 	sub	x20,x20,#8		// i--
     106 
     107 	// (*)	mul	x12,x13,x15	// np[0]*m1
     108 	umulh	x13,x13,x15
     109 	mul	x16,x14,x15		// np[1]*m1
     110 	// (*)	adds	x12,x12,x6
     111 	subs	xzr,x6,#1		// (*) same carry trick as above
     112 	umulh	x17,x14,x15
     113 	cbz	x21,Linner_skip
     114 
     115 Linner:
     116 	ldr	x8,[x1],#8
     117 	adc	x13,x13,xzr
     118 	ldr	x23,[x22],#8		// tp[j]
     119 	adds	x6,x10,x7
     120 	sub	x21,x21,#8		// j--
     121 	adc	x7,x11,xzr
     122 
     123 	adds	x12,x16,x13
     124 	ldr	x14,[x3],#8
     125 	adc	x13,x17,xzr
     126 
     127 	mul	x10,x8,x9		// ap[j]*bp[i]
     128 	adds	x6,x6,x23
     129 	umulh	x11,x8,x9
     130 	adc	x7,x7,xzr
     131 
     132 	mul	x16,x14,x15		// np[j]*m1
     133 	adds	x12,x12,x6
     134 	umulh	x17,x14,x15
     135 	str	x12,[x22,#-16]		// tp[j-1]
     136 	cbnz	x21,Linner
     137 
     138 Linner_skip:
     139 	ldr	x23,[x22],#8		// tp[j]
     140 	adc	x13,x13,xzr
     141 	adds	x6,x10,x7
     142 	sub	x1,x1,x5		// rewind x1
     143 	adc	x7,x11,xzr
     144 
     145 	adds	x12,x16,x13
     146 	sub	x3,x3,x5		// rewind x3
     147 	adcs	x13,x17,x19
     148 	adc	x19,xzr,xzr
     149 
     150 	adds	x6,x6,x23
     151 	adc	x7,x7,xzr
     152 
     153 	adds	x12,x12,x6
     154 	adcs	x13,x13,x7
     155 	adc	x19,x19,xzr		// upmost overflow bit
     156 	stp	x12,x13,[x22,#-16]
     157 
     158 	cbnz	x20,Louter
     159 
     160 	// Final step. We see if result is larger than modulus, and
     161 	// if it is, subtract the modulus. But comparison implies
     162 	// subtraction. So we subtract modulus, see if it borrowed,
     163 	// and conditionally copy original value.
     164 	ldr	x23,[sp]		// tp[0]
     165 	add	x22,sp,#8
     166 	ldr	x14,[x3],#8		// np[0]
     167 	subs	x21,x5,#8		// j=num-1 and clear borrow
     168 	mov	x1,x0
     169 Lsub:
     170 	sbcs	x8,x23,x14		// tp[j]-np[j]
     171 	ldr	x23,[x22],#8
     172 	sub	x21,x21,#8		// j--
     173 	ldr	x14,[x3],#8
     174 	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
     175 	cbnz	x21,Lsub
     176 
     177 	sbcs	x8,x23,x14
     178 	sbcs	x19,x19,xzr		// did it borrow?
     179 	str	x8,[x1],#8		// rp[num-1]
     180 
         // Constant-time-style select: copy tp[] back over rp[] if the
         // subtraction borrowed, wiping the tp[] scratch as we go.
     181 	ldr	x23,[sp]		// tp[0]
     182 	add	x22,sp,#8
     183 	ldr	x8,[x0],#8		// rp[0]
     184 	sub	x5,x5,#8		// num--
     185 	nop
     186 Lcond_copy:
     187 	sub	x5,x5,#8		// num--
     188 	csel	x14,x23,x8,lo		// did it borrow?
     189 	ldr	x23,[x22],#8
     190 	ldr	x8,[x0],#8
     191 	str	xzr,[x22,#-16]		// wipe tp
     192 	str	x14,[x0,#-16]
     193 	cbnz	x5,Lcond_copy
     194 
     195 	csel	x14,x23,x8,lo
     196 	str	xzr,[x22,#-8]		// wipe tp
     197 	str	x14,[x0,#-8]
     198 
     199 	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
     200 	mov	sp,x29			// release alloca'd tp[]
     201 	ldp	x21,x22,[x29,#32]
     202 	mov	x0,#1			// return 1
     203 	ldp	x23,x24,[x29,#48]
     204 	ldr	x29,[sp],#64
     205 	ret
    206 
    207 
         // Internal entry used when num%8==0.  If ap!=bp (x1!=x2) this is an
         // ordinary multiplication and control branches to __bn_mul4x_mont;
         // otherwise the dedicated 8-limb-wide squaring code below runs.
         // Argument registers are as in _bn_mul_mont.
     208 .align	5
     209 __bn_sqr8x_mont:
     210 	cmp	x1,x2
     211 	b.ne	__bn_mul4x_mont
     212 Lsqr8x_mont:
     213 	stp	x29,x30,[sp,#-128]!	// push frame, save fp/lr
     214 	add	x29,sp,#0
     215 	stp	x19,x20,[sp,#16]	// preserve callee-saved x19-x28
     216 	stp	x21,x22,[sp,#32]
     217 	stp	x23,x24,[sp,#48]
     218 	stp	x25,x26,[sp,#64]
     219 	stp	x27,x28,[sp,#80]
     220 	stp	x0,x3,[sp,#96]	// offload rp and np
     221 
     222 	ldp	x6,x7,[x1,#8*0]
     223 	ldp	x8,x9,[x1,#8*2]
     224 	ldp	x10,x11,[x1,#8*4]
     225 	ldp	x12,x13,[x1,#8*6]
     226 
     227 	sub	x2,sp,x5,lsl#4		// reserve num*16 bytes for t[] (2*num limbs)
     228 	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
     229 	ldr	x4,[x4]		// *n0
     230 	mov	sp,x2			// alloca
     231 	sub	x27,x5,#8*8
     232 	b	Lsqr8x_zero_start
     233 
         // Zero the 2*num-limb t[] scratch area, 16 limbs per iteration.
     234 Lsqr8x_zero:
     235 	sub	x27,x27,#8*8
     236 	stp	xzr,xzr,[x2,#8*0]
     237 	stp	xzr,xzr,[x2,#8*2]
     238 	stp	xzr,xzr,[x2,#8*4]
     239 	stp	xzr,xzr,[x2,#8*6]
     240 Lsqr8x_zero_start:
     241 	stp	xzr,xzr,[x2,#8*8]
     242 	stp	xzr,xzr,[x2,#8*10]
     243 	stp	xzr,xzr,[x2,#8*12]
     244 	stp	xzr,xzr,[x2,#8*14]
     245 	add	x2,x2,#8*16
     246 	cbnz	x27,Lsqr8x_zero
     247 
     248 	add	x3,x1,x5
     249 	add	x1,x1,#8*8
     250 	mov	x19,xzr
     251 	mov	x20,xzr
     252 	mov	x21,xzr
     253 	mov	x22,xzr
     254 	mov	x23,xzr
     255 	mov	x24,xzr
     256 	mov	x25,xzr
     257 	mov	x26,xzr
     258 	mov	x2,sp
     259 	str	x4,[x29,#112]		// offload n0
     260 
     261 	// Multiply everything but a[i]*a[i]
     262 .align	4
     263 Lsqr8x_outer_loop:
     264         //                                                 a[1]a[0]	(i)
     265         //                                             a[2]a[0]
     266         //                                         a[3]a[0]
     267         //                                     a[4]a[0]
     268         //                                 a[5]a[0]
     269         //                             a[6]a[0]
     270         //                         a[7]a[0]
     271         //                                         a[2]a[1]		(ii)
     272         //                                     a[3]a[1]
     273         //                                 a[4]a[1]
     274         //                             a[5]a[1]
     275         //                         a[6]a[1]
     276         //                     a[7]a[1]
     277         //                                 a[3]a[2]			(iii)
     278         //                             a[4]a[2]
     279         //                         a[5]a[2]
     280         //                     a[6]a[2]
     281         //                 a[7]a[2]
     282         //                         a[4]a[3]				(iv)
     283         //                     a[5]a[3]
     284         //                 a[6]a[3]
     285         //             a[7]a[3]
     286         //                 a[5]a[4]					(v)
     287         //             a[6]a[4]
     288         //         a[7]a[4]
     289         //         a[6]a[5]						(vi)
     290         //     a[7]a[5]
     291         // a[7]a[6]							(vii)
     292 
     293 	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
     294 	mul	x15,x8,x6
     295 	mul	x16,x9,x6
     296 	mul	x17,x10,x6
     297 	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
     298 	mul	x14,x11,x6
     299 	adcs	x21,x21,x15
     300 	mul	x15,x12,x6
     301 	adcs	x22,x22,x16
     302 	mul	x16,x13,x6
     303 	adcs	x23,x23,x17
     304 	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
     305 	adcs	x24,x24,x14
     306 	umulh	x14,x8,x6
     307 	adcs	x25,x25,x15
     308 	umulh	x15,x9,x6
     309 	adcs	x26,x26,x16
     310 	umulh	x16,x10,x6
     311 	stp	x19,x20,[x2],#8*2	// t[0..1]
     312 	adc	x19,xzr,xzr		// t[8]
     313 	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
     314 	umulh	x17,x11,x6
     315 	adcs	x22,x22,x14
     316 	umulh	x14,x12,x6
     317 	adcs	x23,x23,x15
     318 	umulh	x15,x13,x6
     319 	adcs	x24,x24,x16
     320 	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
     321 	adcs	x25,x25,x17
     322 	mul	x17,x9,x7
     323 	adcs	x26,x26,x14
     324 	mul	x14,x10,x7
     325 	adc	x19,x19,x15
     326 
     327 	mul	x15,x11,x7
     328 	adds	x22,x22,x16
     329 	mul	x16,x12,x7
     330 	adcs	x23,x23,x17
     331 	mul	x17,x13,x7
     332 	adcs	x24,x24,x14
     333 	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
     334 	adcs	x25,x25,x15
     335 	umulh	x15,x9,x7
     336 	adcs	x26,x26,x16
     337 	umulh	x16,x10,x7
     338 	adcs	x19,x19,x17
     339 	umulh	x17,x11,x7
     340 	stp	x21,x22,[x2],#8*2	// t[2..3]
     341 	adc	x20,xzr,xzr		// t[9]
     342 	adds	x23,x23,x14
     343 	umulh	x14,x12,x7
     344 	adcs	x24,x24,x15
     345 	umulh	x15,x13,x7
     346 	adcs	x25,x25,x16
     347 	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
     348 	adcs	x26,x26,x17
     349 	mul	x17,x10,x8
     350 	adcs	x19,x19,x14
     351 	mul	x14,x11,x8
     352 	adc	x20,x20,x15
     353 
     354 	mul	x15,x12,x8
     355 	adds	x24,x24,x16
     356 	mul	x16,x13,x8
     357 	adcs	x25,x25,x17
     358 	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
     359 	adcs	x26,x26,x14
     360 	umulh	x14,x10,x8
     361 	adcs	x19,x19,x15
     362 	umulh	x15,x11,x8
     363 	adcs	x20,x20,x16
     364 	umulh	x16,x12,x8
     365 	stp	x23,x24,[x2],#8*2	// t[4..5]
     366 	adc	x21,xzr,xzr		// t[10]
     367 	adds	x25,x25,x17
     368 	umulh	x17,x13,x8
     369 	adcs	x26,x26,x14
     370 	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
     371 	adcs	x19,x19,x15
     372 	mul	x15,x11,x9
     373 	adcs	x20,x20,x16
     374 	mul	x16,x12,x9
     375 	adc	x21,x21,x17
     376 
     377 	mul	x17,x13,x9
     378 	adds	x26,x26,x14
     379 	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
     380 	adcs	x19,x19,x15
     381 	umulh	x15,x11,x9
     382 	adcs	x20,x20,x16
     383 	umulh	x16,x12,x9
     384 	adcs	x21,x21,x17
     385 	umulh	x17,x13,x9
     386 	stp	x25,x26,[x2],#8*2	// t[6..7]
     387 	adc	x22,xzr,xzr		// t[11]
     388 	adds	x19,x19,x14
     389 	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
     390 	adcs	x20,x20,x15
     391 	mul	x15,x12,x10
     392 	adcs	x21,x21,x16
     393 	mul	x16,x13,x10
     394 	adc	x22,x22,x17
     395 
     396 	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
     397 	adds	x20,x20,x14
     398 	umulh	x14,x12,x10
     399 	adcs	x21,x21,x15
     400 	umulh	x15,x13,x10
     401 	adcs	x22,x22,x16
     402 	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
     403 	adc	x23,xzr,xzr		// t[12]
     404 	adds	x21,x21,x17
     405 	mul	x17,x13,x11
     406 	adcs	x22,x22,x14
     407 	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
     408 	adc	x23,x23,x15
     409 
     410 	umulh	x15,x13,x11
     411 	adds	x22,x22,x16
     412 	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
     413 	adcs	x23,x23,x17
     414 	umulh	x17,x13,x12		// hi(a[7]*a[6])
     415 	adc	x24,xzr,xzr		// t[13]
     416 	adds	x23,x23,x14
     417 	sub	x27,x3,x1	// done yet?
     418 	adc	x24,x24,x15
     419 
     420 	adds	x24,x24,x16
     421 	sub	x14,x3,x5	// rewinded ap
     422 	adc	x25,xzr,xzr		// t[14]
     423 	add	x25,x25,x17
     424 
     425 	cbz	x27,Lsqr8x_outer_break
     426 
     427 	mov	x4,x6
     428 	ldp	x6,x7,[x2,#8*0]
     429 	ldp	x8,x9,[x2,#8*2]
     430 	ldp	x10,x11,[x2,#8*4]
     431 	ldp	x12,x13,[x2,#8*6]
     432 	adds	x19,x19,x6
     433 	adcs	x20,x20,x7
     434 	ldp	x6,x7,[x1,#8*0]
     435 	adcs	x21,x21,x8
     436 	adcs	x22,x22,x9
     437 	ldp	x8,x9,[x1,#8*2]
     438 	adcs	x23,x23,x10
     439 	adcs	x24,x24,x11
     440 	ldp	x10,x11,[x1,#8*4]
     441 	adcs	x25,x25,x12
     442 	mov	x0,x1
     443 	adcs	x26,xzr,x13
     444 	ldp	x12,x13,[x1,#8*6]
     445 	add	x1,x1,#8*8
     446 	//adc	x28,xzr,xzr		// moved below
     447 	mov	x27,#-8*8
     448 
     449 	//                                                         a[8]a[0]
     450 	//                                                     a[9]a[0]
     451 	//                                                 a[a]a[0]
     452 	//                                             a[b]a[0]
     453 	//                                         a[c]a[0]
     454 	//                                     a[d]a[0]
     455 	//                                 a[e]a[0]
     456 	//                             a[f]a[0]
     457 	//                                                     a[8]a[1]
     458 	//                         a[f]a[1]........................
     459 	//                                                 a[8]a[2]
     460 	//                     a[f]a[2]........................
     461 	//                                             a[8]a[3]
     462 	//                 a[f]a[3]........................
     463 	//                                         a[8]a[4]
     464 	//             a[f]a[4]........................
     465 	//                                     a[8]a[5]
     466 	//         a[f]a[5]........................
     467 	//                                 a[8]a[6]
     468 	//     a[f]a[6]........................
     469 	//                             a[8]a[7]
     470 	// a[f]a[7]........................
     471 Lsqr8x_mul:
     472 	mul	x14,x6,x4
     473 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
     474 	mul	x15,x7,x4
     475 	add	x27,x27,#8
     476 	mul	x16,x8,x4
     477 	mul	x17,x9,x4
     478 	adds	x19,x19,x14
     479 	mul	x14,x10,x4
     480 	adcs	x20,x20,x15
     481 	mul	x15,x11,x4
     482 	adcs	x21,x21,x16
     483 	mul	x16,x12,x4
     484 	adcs	x22,x22,x17
     485 	mul	x17,x13,x4
     486 	adcs	x23,x23,x14
     487 	umulh	x14,x6,x4
     488 	adcs	x24,x24,x15
     489 	umulh	x15,x7,x4
     490 	adcs	x25,x25,x16
     491 	umulh	x16,x8,x4
     492 	adcs	x26,x26,x17
     493 	umulh	x17,x9,x4
     494 	adc	x28,x28,xzr
     495 	str	x19,[x2],#8
     496 	adds	x19,x20,x14
     497 	umulh	x14,x10,x4
     498 	adcs	x20,x21,x15
     499 	umulh	x15,x11,x4
     500 	adcs	x21,x22,x16
     501 	umulh	x16,x12,x4
     502 	adcs	x22,x23,x17
     503 	umulh	x17,x13,x4
     504 	ldr	x4,[x0,x27]
     505 	adcs	x23,x24,x14
     506 	adcs	x24,x25,x15
     507 	adcs	x25,x26,x16
     508 	adcs	x26,x28,x17
     509 	//adc	x28,xzr,xzr		// moved above
     510 	cbnz	x27,Lsqr8x_mul
     511 					// note that carry flag is guaranteed
     512 					// to be zero at this point
     513 	cmp	x1,x3		// done yet?
     514 	b.eq	Lsqr8x_break
     515 
     516 	ldp	x6,x7,[x2,#8*0]
     517 	ldp	x8,x9,[x2,#8*2]
     518 	ldp	x10,x11,[x2,#8*4]
     519 	ldp	x12,x13,[x2,#8*6]
     520 	adds	x19,x19,x6
     521 	ldr	x4,[x0,#-8*8]
     522 	adcs	x20,x20,x7
     523 	ldp	x6,x7,[x1,#8*0]
     524 	adcs	x21,x21,x8
     525 	adcs	x22,x22,x9
     526 	ldp	x8,x9,[x1,#8*2]
     527 	adcs	x23,x23,x10
     528 	adcs	x24,x24,x11
     529 	ldp	x10,x11,[x1,#8*4]
     530 	adcs	x25,x25,x12
     531 	mov	x27,#-8*8
     532 	adcs	x26,x26,x13
     533 	ldp	x12,x13,[x1,#8*6]
     534 	add	x1,x1,#8*8
     535 	//adc	x28,xzr,xzr		// moved above
     536 	b	Lsqr8x_mul
     537 
     538 .align	4
     539 Lsqr8x_break:
     540 	ldp	x6,x7,[x0,#8*0]
     541 	add	x1,x0,#8*8
     542 	ldp	x8,x9,[x0,#8*2]
     543 	sub	x14,x3,x1		// is it last iteration?
     544 	ldp	x10,x11,[x0,#8*4]
     545 	sub	x15,x2,x14
     546 	ldp	x12,x13,[x0,#8*6]
     547 	cbz	x14,Lsqr8x_outer_loop
     548 
     549 	stp	x19,x20,[x2,#8*0]
     550 	ldp	x19,x20,[x15,#8*0]
     551 	stp	x21,x22,[x2,#8*2]
     552 	ldp	x21,x22,[x15,#8*2]
     553 	stp	x23,x24,[x2,#8*4]
     554 	ldp	x23,x24,[x15,#8*4]
     555 	stp	x25,x26,[x2,#8*6]
     556 	mov	x2,x15
     557 	ldp	x25,x26,[x15,#8*6]
     558 	b	Lsqr8x_outer_loop
     559 
     560 .align	4
     561 Lsqr8x_outer_break:
     562 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
     563 	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
     564 	ldp	x15,x16,[sp,#8*1]
     565 	ldp	x11,x13,[x14,#8*2]
     566 	add	x1,x14,#8*4
     567 	ldp	x17,x14,[sp,#8*3]
     568 
     569 	stp	x19,x20,[x2,#8*0]
     570 	mul	x19,x7,x7
     571 	stp	x21,x22,[x2,#8*2]
     572 	umulh	x7,x7,x7
     573 	stp	x23,x24,[x2,#8*4]
     574 	mul	x8,x9,x9
     575 	stp	x25,x26,[x2,#8*6]
     576 	mov	x2,sp
     577 	umulh	x9,x9,x9
     578 	adds	x20,x7,x15,lsl#1
     579 	extr	x15,x16,x15,#63
     580 	sub	x27,x5,#8*4
     581 
         // Double the cross products via extr (65-bit shift chain) and add
         // the a[i]^2 diagonal terms, four limbs per iteration.
     582 Lsqr4x_shift_n_add:
     583 	adcs	x21,x8,x15
     584 	extr	x16,x17,x16,#63
     585 	sub	x27,x27,#8*4
     586 	adcs	x22,x9,x16
     587 	ldp	x15,x16,[x2,#8*5]
     588 	mul	x10,x11,x11
     589 	ldp	x7,x9,[x1],#8*2
     590 	umulh	x11,x11,x11
     591 	mul	x12,x13,x13
     592 	umulh	x13,x13,x13
     593 	extr	x17,x14,x17,#63
     594 	stp	x19,x20,[x2,#8*0]
     595 	adcs	x23,x10,x17
     596 	extr	x14,x15,x14,#63
     597 	stp	x21,x22,[x2,#8*2]
     598 	adcs	x24,x11,x14
     599 	ldp	x17,x14,[x2,#8*7]
     600 	extr	x15,x16,x15,#63
     601 	adcs	x25,x12,x15
     602 	extr	x16,x17,x16,#63
     603 	adcs	x26,x13,x16
     604 	ldp	x15,x16,[x2,#8*9]
     605 	mul	x6,x7,x7
     606 	ldp	x11,x13,[x1],#8*2
     607 	umulh	x7,x7,x7
     608 	mul	x8,x9,x9
     609 	umulh	x9,x9,x9
     610 	stp	x23,x24,[x2,#8*4]
     611 	extr	x17,x14,x17,#63
     612 	stp	x25,x26,[x2,#8*6]
     613 	add	x2,x2,#8*8
     614 	adcs	x19,x6,x17
     615 	extr	x14,x15,x14,#63
     616 	adcs	x20,x7,x14
     617 	ldp	x17,x14,[x2,#8*3]
     618 	extr	x15,x16,x15,#63
     619 	cbnz	x27,Lsqr4x_shift_n_add
     620 	ldp	x1,x4,[x29,#104]	// pull np and n0
     621 
     622 	adcs	x21,x8,x15
     623 	extr	x16,x17,x16,#63
     624 	adcs	x22,x9,x16
     625 	ldp	x15,x16,[x2,#8*5]
     626 	mul	x10,x11,x11
     627 	umulh	x11,x11,x11
     628 	stp	x19,x20,[x2,#8*0]
     629 	mul	x12,x13,x13
     630 	umulh	x13,x13,x13
     631 	stp	x21,x22,[x2,#8*2]
     632 	extr	x17,x14,x17,#63
     633 	adcs	x23,x10,x17
     634 	extr	x14,x15,x14,#63
     635 	ldp	x19,x20,[sp,#8*0]
     636 	adcs	x24,x11,x14
     637 	extr	x15,x16,x15,#63
     638 	ldp	x6,x7,[x1,#8*0]
     639 	adcs	x25,x12,x15
     640 	extr	x16,xzr,x16,#63
     641 	ldp	x8,x9,[x1,#8*2]
     642 	adc	x26,x13,x16
     643 	ldp	x10,x11,[x1,#8*4]
     644 
     645 	// Reduce by 512 bits per iteration
     646 	mul	x28,x4,x19		// t[0]*n0
     647 	ldp	x12,x13,[x1,#8*6]
     648 	add	x3,x1,x5
     649 	ldp	x21,x22,[sp,#8*2]
     650 	stp	x23,x24,[x2,#8*4]
     651 	ldp	x23,x24,[sp,#8*4]
     652 	stp	x25,x26,[x2,#8*6]
     653 	ldp	x25,x26,[sp,#8*6]
     654 	add	x1,x1,#8*8
     655 	mov	x30,xzr		// initial top-most carry
     656 	mov	x2,sp
     657 	mov	x27,#8
     658 
     659 Lsqr8x_reduction:
     660 	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
     661 	mul	x15,x7,x28
     662 	sub	x27,x27,#1
     663 	mul	x16,x8,x28
     664 	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
     665 	mul	x17,x9,x28
     666 	// (*)	adds	xzr,x19,x14
     667 	subs	xzr,x19,#1		// (*)
     668 	mul	x14,x10,x28
     669 	adcs	x19,x20,x15
     670 	mul	x15,x11,x28
     671 	adcs	x20,x21,x16
     672 	mul	x16,x12,x28
     673 	adcs	x21,x22,x17
     674 	mul	x17,x13,x28
     675 	adcs	x22,x23,x14
     676 	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
     677 	adcs	x23,x24,x15
     678 	umulh	x15,x7,x28
     679 	adcs	x24,x25,x16
     680 	umulh	x16,x8,x28
     681 	adcs	x25,x26,x17
     682 	umulh	x17,x9,x28
     683 	adc	x26,xzr,xzr
     684 	adds	x19,x19,x14
     685 	umulh	x14,x10,x28
     686 	adcs	x20,x20,x15
     687 	umulh	x15,x11,x28
     688 	adcs	x21,x21,x16
     689 	umulh	x16,x12,x28
     690 	adcs	x22,x22,x17
     691 	umulh	x17,x13,x28
     692 	mul	x28,x4,x19		// next t[0]*n0
     693 	adcs	x23,x23,x14
     694 	adcs	x24,x24,x15
     695 	adcs	x25,x25,x16
     696 	adc	x26,x26,x17
     697 	cbnz	x27,Lsqr8x_reduction
     698 
     699 	ldp	x14,x15,[x2,#8*0]
     700 	ldp	x16,x17,[x2,#8*2]
     701 	mov	x0,x2
     702 	sub	x27,x3,x1	// done yet?
     703 	adds	x19,x19,x14
     704 	adcs	x20,x20,x15
     705 	ldp	x14,x15,[x2,#8*4]
     706 	adcs	x21,x21,x16
     707 	adcs	x22,x22,x17
     708 	ldp	x16,x17,[x2,#8*6]
     709 	adcs	x23,x23,x14
     710 	adcs	x24,x24,x15
     711 	adcs	x25,x25,x16
     712 	adcs	x26,x26,x17
     713 	//adc	x28,xzr,xzr		// moved below
     714 	cbz	x27,Lsqr8x8_post_condition
     715 
     716 	ldr	x4,[x2,#-8*8]
     717 	ldp	x6,x7,[x1,#8*0]
     718 	ldp	x8,x9,[x1,#8*2]
     719 	ldp	x10,x11,[x1,#8*4]
     720 	mov	x27,#-8*8
     721 	ldp	x12,x13,[x1,#8*6]
     722 	add	x1,x1,#8*8
     723 
     724 Lsqr8x_tail:
     725 	mul	x14,x6,x4
     726 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
     727 	mul	x15,x7,x4
     728 	add	x27,x27,#8
     729 	mul	x16,x8,x4
     730 	mul	x17,x9,x4
     731 	adds	x19,x19,x14
     732 	mul	x14,x10,x4
     733 	adcs	x20,x20,x15
     734 	mul	x15,x11,x4
     735 	adcs	x21,x21,x16
     736 	mul	x16,x12,x4
     737 	adcs	x22,x22,x17
     738 	mul	x17,x13,x4
     739 	adcs	x23,x23,x14
     740 	umulh	x14,x6,x4
     741 	adcs	x24,x24,x15
     742 	umulh	x15,x7,x4
     743 	adcs	x25,x25,x16
     744 	umulh	x16,x8,x4
     745 	adcs	x26,x26,x17
     746 	umulh	x17,x9,x4
     747 	adc	x28,x28,xzr
     748 	str	x19,[x2],#8
     749 	adds	x19,x20,x14
     750 	umulh	x14,x10,x4
     751 	adcs	x20,x21,x15
     752 	umulh	x15,x11,x4
     753 	adcs	x21,x22,x16
     754 	umulh	x16,x12,x4
     755 	adcs	x22,x23,x17
     756 	umulh	x17,x13,x4
     757 	ldr	x4,[x0,x27]
     758 	adcs	x23,x24,x14
     759 	adcs	x24,x25,x15
     760 	adcs	x25,x26,x16
     761 	adcs	x26,x28,x17
     762 	//adc	x28,xzr,xzr		// moved above
     763 	cbnz	x27,Lsqr8x_tail
     764 					// note that carry flag is guaranteed
     765 					// to be zero at this point
     766 	ldp	x6,x7,[x2,#8*0]
     767 	sub	x27,x3,x1	// done yet?
     768 	sub	x16,x3,x5	// rewinded np
     769 	ldp	x8,x9,[x2,#8*2]
     770 	ldp	x10,x11,[x2,#8*4]
     771 	ldp	x12,x13,[x2,#8*6]
     772 	cbz	x27,Lsqr8x_tail_break
     773 
     774 	ldr	x4,[x0,#-8*8]
     775 	adds	x19,x19,x6
     776 	adcs	x20,x20,x7
     777 	ldp	x6,x7,[x1,#8*0]
     778 	adcs	x21,x21,x8
     779 	adcs	x22,x22,x9
     780 	ldp	x8,x9,[x1,#8*2]
     781 	adcs	x23,x23,x10
     782 	adcs	x24,x24,x11
     783 	ldp	x10,x11,[x1,#8*4]
     784 	adcs	x25,x25,x12
     785 	mov	x27,#-8*8
     786 	adcs	x26,x26,x13
     787 	ldp	x12,x13,[x1,#8*6]
     788 	add	x1,x1,#8*8
     789 	//adc	x28,xzr,xzr		// moved above
     790 	b	Lsqr8x_tail
     791 
     792 .align	4
     793 Lsqr8x_tail_break:
     794 	ldr	x4,[x29,#112]		// pull n0
     795 	add	x27,x2,#8*8		// end of current t[num] window
     796 
     797 	subs	xzr,x30,#1		// "move" top-most carry to carry bit
     798 	adcs	x14,x19,x6
     799 	adcs	x15,x20,x7
     800 	ldp	x19,x20,[x0,#8*0]
     801 	adcs	x21,x21,x8
     802 	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
     803 	adcs	x22,x22,x9
     804 	ldp	x8,x9,[x16,#8*2]
     805 	adcs	x23,x23,x10
     806 	adcs	x24,x24,x11
     807 	ldp	x10,x11,[x16,#8*4]
     808 	adcs	x25,x25,x12
     809 	adcs	x26,x26,x13
     810 	ldp	x12,x13,[x16,#8*6]
     811 	add	x1,x16,#8*8
     812 	adc	x30,xzr,xzr	// top-most carry
     813 	mul	x28,x4,x19
     814 	stp	x14,x15,[x2,#8*0]
     815 	stp	x21,x22,[x2,#8*2]
     816 	ldp	x21,x22,[x0,#8*2]
     817 	stp	x23,x24,[x2,#8*4]
     818 	ldp	x23,x24,[x0,#8*4]
     819 	cmp	x27,x29		// did we hit the bottom?
     820 	stp	x25,x26,[x2,#8*6]
     821 	mov	x2,x0			// slide the window
     822 	ldp	x25,x26,[x0,#8*6]
     823 	mov	x27,#8
     824 	b.ne	Lsqr8x_reduction
     825 
     826 	// Final step. We see if result is larger than modulus, and
     827 	// if it is, subtract the modulus. But comparison implies
     828 	// subtraction. So we subtract modulus, see if it borrowed,
     829 	// and conditionally copy original value.
     830 	ldr	x0,[x29,#96]		// pull rp
     831 	add	x2,x2,#8*8
     832 	subs	x14,x19,x6
     833 	sbcs	x15,x20,x7
     834 	sub	x27,x5,#8*8
     835 	mov	x3,x0		// x0 copy
     836 
     837 Lsqr8x_sub:
     838 	sbcs	x16,x21,x8
     839 	ldp	x6,x7,[x1,#8*0]
     840 	sbcs	x17,x22,x9
     841 	stp	x14,x15,[x0,#8*0]
     842 	sbcs	x14,x23,x10
     843 	ldp	x8,x9,[x1,#8*2]
     844 	sbcs	x15,x24,x11
     845 	stp	x16,x17,[x0,#8*2]
     846 	sbcs	x16,x25,x12
     847 	ldp	x10,x11,[x1,#8*4]
     848 	sbcs	x17,x26,x13
     849 	ldp	x12,x13,[x1,#8*6]
     850 	add	x1,x1,#8*8
     851 	ldp	x19,x20,[x2,#8*0]
     852 	sub	x27,x27,#8*8
     853 	ldp	x21,x22,[x2,#8*2]
     854 	ldp	x23,x24,[x2,#8*4]
     855 	ldp	x25,x26,[x2,#8*6]
     856 	add	x2,x2,#8*8
     857 	stp	x14,x15,[x0,#8*4]
     858 	sbcs	x14,x19,x6
     859 	stp	x16,x17,[x0,#8*6]
     860 	add	x0,x0,#8*8
     861 	sbcs	x15,x20,x7
     862 	cbnz	x27,Lsqr8x_sub
     863 
     864 	sbcs	x16,x21,x8
     865 	mov	x2,sp
     866 	add	x1,sp,x5
     867 	ldp	x6,x7,[x3,#8*0]
     868 	sbcs	x17,x22,x9
     869 	stp	x14,x15,[x0,#8*0]
     870 	sbcs	x14,x23,x10
     871 	ldp	x8,x9,[x3,#8*2]
     872 	sbcs	x15,x24,x11
     873 	stp	x16,x17,[x0,#8*2]
     874 	sbcs	x16,x25,x12
     875 	ldp	x19,x20,[x1,#8*0]
     876 	sbcs	x17,x26,x13
     877 	ldp	x21,x22,[x1,#8*2]
     878 	sbcs	xzr,x30,xzr	// did it borrow?
     879 	ldr	x30,[x29,#8]		// pull return address
     880 	stp	x14,x15,[x0,#8*4]
     881 	stp	x16,x17,[x0,#8*6]
     882 
         // Select original value if the subtraction borrowed, and wipe the
         // on-stack t[] scratch as we go (four limbs per iteration).
     883 	sub	x27,x5,#8*4
     884 Lsqr4x_cond_copy:
     885 	sub	x27,x27,#8*4
     886 	csel	x14,x19,x6,lo
     887 	stp	xzr,xzr,[x2,#8*0]
     888 	csel	x15,x20,x7,lo
     889 	ldp	x6,x7,[x3,#8*4]
     890 	ldp	x19,x20,[x1,#8*4]
     891 	csel	x16,x21,x8,lo
     892 	stp	xzr,xzr,[x2,#8*2]
     893 	add	x2,x2,#8*4
     894 	csel	x17,x22,x9,lo
     895 	ldp	x8,x9,[x3,#8*6]
     896 	ldp	x21,x22,[x1,#8*6]
     897 	add	x1,x1,#8*4
     898 	stp	x14,x15,[x3,#8*0]
     899 	stp	x16,x17,[x3,#8*2]
     900 	add	x3,x3,#8*4
     901 	stp	xzr,xzr,[x1,#8*0]
     902 	stp	xzr,xzr,[x1,#8*2]
     903 	cbnz	x27,Lsqr4x_cond_copy
     904 
     905 	csel	x14,x19,x6,lo
     906 	stp	xzr,xzr,[x2,#8*0]
     907 	csel	x15,x20,x7,lo
     908 	stp	xzr,xzr,[x2,#8*2]
     909 	csel	x16,x21,x8,lo
     910 	csel	x17,x22,x9,lo
     911 	stp	x14,x15,[x3,#8*0]
     912 	stp	x16,x17,[x3,#8*2]
     913 
     914 	b	Lsqr8x_done
     915 
     916 .align	4
     917 Lsqr8x8_post_condition:
     918 	adc	x28,xzr,xzr
     919 	ldr	x30,[x29,#8]		// pull return address
     920 	// x19-7,x28 hold result, x6-7 hold modulus
     921 	subs	x6,x19,x6
     922 	ldr	x1,[x29,#96]		// pull rp
     923 	sbcs	x7,x20,x7
     924 	stp	xzr,xzr,[sp,#8*0]
     925 	sbcs	x8,x21,x8
     926 	stp	xzr,xzr,[sp,#8*2]
     927 	sbcs	x9,x22,x9
     928 	stp	xzr,xzr,[sp,#8*4]
     929 	sbcs	x10,x23,x10
     930 	stp	xzr,xzr,[sp,#8*6]
     931 	sbcs	x11,x24,x11
     932 	stp	xzr,xzr,[sp,#8*8]
     933 	sbcs	x12,x25,x12
     934 	stp	xzr,xzr,[sp,#8*10]
     935 	sbcs	x13,x26,x13
     936 	stp	xzr,xzr,[sp,#8*12]
     937 	sbcs	x28,x28,xzr	// did it borrow?
     938 	stp	xzr,xzr,[sp,#8*14]
     939 
     940 	// x6-7 hold result-modulus
     941 	csel	x6,x19,x6,lo
     942 	csel	x7,x20,x7,lo
     943 	csel	x8,x21,x8,lo
     944 	csel	x9,x22,x9,lo
     945 	stp	x6,x7,[x1,#8*0]
     946 	csel	x10,x23,x10,lo
     947 	csel	x11,x24,x11,lo
     948 	stp	x8,x9,[x1,#8*2]
     949 	csel	x12,x25,x12,lo
     950 	csel	x13,x26,x13,lo
     951 	stp	x10,x11,[x1,#8*4]
     952 	stp	x12,x13,[x1,#8*6]
     953 
     954 Lsqr8x_done:
     955 	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
     956 	mov	sp,x29			// release alloca'd t[]
     957 	ldp	x21,x22,[x29,#32]
     958 	mov	x0,#1			// return 1
     959 	ldp	x23,x24,[x29,#48]
     960 	ldp	x25,x26,[x29,#64]
     961 	ldp	x27,x28,[x29,#80]
     962 	ldr	x29,[sp],#128
     963 	ret
    964 
    965 
    966 .align	5
    967 __bn_mul4x_mont:
    968 	stp	x29,x30,[sp,#-128]!
    969 	add	x29,sp,#0
    970 	stp	x19,x20,[sp,#16]
    971 	stp	x21,x22,[sp,#32]
    972 	stp	x23,x24,[sp,#48]
    973 	stp	x25,x26,[sp,#64]
    974 	stp	x27,x28,[sp,#80]
    975 
    976 	sub	x26,sp,x5,lsl#3
    977 	lsl	x5,x5,#3
    978 	ldr	x4,[x4]		// *n0
    979 	sub	sp,x26,#8*4		// alloca
    980 
    981 	add	x10,x2,x5
    982 	add	x27,x1,x5
    983 	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
    984 
    985 	ldr	x24,[x2,#8*0]		// b[0]
    986 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
    987 	ldp	x8,x9,[x1,#8*2]
    988 	add	x1,x1,#8*4
    989 	mov	x19,xzr
    990 	mov	x20,xzr
    991 	mov	x21,xzr
    992 	mov	x22,xzr
    993 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
    994 	ldp	x16,x17,[x3,#8*2]
    995 	adds	x3,x3,#8*4		// clear carry bit
    996 	mov	x0,xzr
    997 	mov	x28,#0
    998 	mov	x26,sp
    999 
   1000 Loop_mul4x_1st_reduction:
   1001 	mul	x10,x6,x24		// lo(a[0..3]*b[0])
   1002 	adc	x0,x0,xzr	// modulo-scheduled
   1003 	mul	x11,x7,x24
   1004 	add	x28,x28,#8
   1005 	mul	x12,x8,x24
   1006 	and	x28,x28,#31
   1007 	mul	x13,x9,x24
   1008 	adds	x19,x19,x10
   1009 	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
   1010 	adcs	x20,x20,x11
   1011 	mul	x25,x19,x4		// t[0]*n0
   1012 	adcs	x21,x21,x12
   1013 	umulh	x11,x7,x24
   1014 	adcs	x22,x22,x13
   1015 	umulh	x12,x8,x24
   1016 	adc	x23,xzr,xzr
   1017 	umulh	x13,x9,x24
   1018 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1019 	adds	x20,x20,x10
   1020 	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
   1021 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1022 	adcs	x21,x21,x11
   1023 	mul	x11,x15,x25
   1024 	adcs	x22,x22,x12
   1025 	mul	x12,x16,x25
   1026 	adc	x23,x23,x13		// can't overflow
   1027 	mul	x13,x17,x25
   1028 	// (*)	adds	xzr,x19,x10
   1029 	subs	xzr,x19,#1		// (*)
   1030 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1031 	adcs	x19,x20,x11
   1032 	umulh	x11,x15,x25
   1033 	adcs	x20,x21,x12
   1034 	umulh	x12,x16,x25
   1035 	adcs	x21,x22,x13
   1036 	umulh	x13,x17,x25
   1037 	adcs	x22,x23,x0
   1038 	adc	x0,xzr,xzr
   1039 	adds	x19,x19,x10
   1040 	sub	x10,x27,x1
   1041 	adcs	x20,x20,x11
   1042 	adcs	x21,x21,x12
   1043 	adcs	x22,x22,x13
   1044 	//adc	x0,x0,xzr
   1045 	cbnz	x28,Loop_mul4x_1st_reduction
   1046 
   1047 	cbz	x10,Lmul4x4_post_condition
   1048 
	// First-pass tail: process the remaining a[4..]/n[4..] quadruples
	// for the current b[i], consuming the t[0]*n0 values that the
	// reduction loop stashed on the stack.
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// b[i] index wraps every 4 words (32 bytes)
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,Lmul4x_proceed

	// More tail quadruples remain: advance to the next a[4..7]/n[4..7]
	// group and keep going.
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail
   1113 
.align	5
// All of a[] has been processed for the first four b words.  Rewind the
// ap/np pointers, step bp forward by four words and set up register
// state for the main Loop_mul4x_reduction pass.
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// stash top-most carry in x30
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
   1133 
.align	4
// Main loop: for each of the next four b words, multiply a[0..3] by
// b[i], compute m = t[0]*n0 on the fly and fold in n[0..3]*m (the
// Montgomery reduction step).  Each m is also stashed on the stack
// (via x26) for later consumption by Loop_mul4x_tail.
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// b[i] index wraps every 4 words
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	// See commentary at Lmul_mont: the discarded addition can only
	// contribute a carry, and it carries iff x19 != 0; subs #1
	// regenerates that carry without the dropped multiplication.
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	// Fold in the previously stored t[4..7] and load the next
	// a[4..7]/n[4..7] quadruple for the tail loop.
	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
   1197 
.align	4
// Tail loop of the main pass: like Loop_mul4x_1st_tail, but the t[0]*n0
// values were stashed on the stack by Loop_mul4x_reduction and are
// consumed here via [sp,x28].
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// b[i] index wraps every 4 words
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	// More a[]/n[] quadruples remain: fold in the stored t[4..7] and
	// continue the tail loop.
	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail
   1263 
.align	4
// End of one full pass over a[]/n[]: fold in the carry saved in x30,
// advance bp by four words, then either loop back for the next group of
// b words or fall through to the final reduction when bp == &b[num].
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr		// save top-most carry for next pass
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction
   1291 
.align	4
Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12			// x0 = rp (running store pointer)
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14		// t[0] - n[0]
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4		// byte count of words still to subtract

// Subtract the modulus from the result, four words per iteration,
// streaming the difference out to rp as it goes.
Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub
   1321 
	// Last four difference words, then the decision: if the subtraction
	// borrowed ("lo" condition), the result was smaller than the modulus,
	// so keep the unreduced value still held on the stack; otherwise keep
	// the difference already stored at rp.  csel (not a branch) keeps the
	// selection constant-time, and the stack copy is wiped with zeros as
	// it is consumed.
	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4		// x1 -> unreduced result on the stack
	ldp	x6,x7,[x27,#8*0]	// difference words already at rp
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]	// unreduced words from the stack
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// lo (borrow) -> keep unreduced value
	stp	xzr,xzr,[x26,#8*0]	// wipe stack
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	// Final quadruple.  The overlapping zero-stores below wipe the
	// remaining stack slots.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done
   1367 
.align	4
// num==4 special case: the whole result still lives in registers
// (x19..x22 plus carry in x0); reduce modulo n and store it directly,
// wiping the stack scratch area along the way.
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe stack
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo		// borrow -> keep unreduced result
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]
   1391 
// Common exit: restore the callee-saved registers spilled in the
// prologue (AAPCS64: x19-x28, frame record in x29/x30), release the
// 128-byte frame and return 1 in x0.
Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
   1402 
   1403 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1404 .align	2
   1405 .align	4
   1406