// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl _bn_mul_mont
.private_extern _bn_mul_mont

.align 5
_bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition:
	//	the result of that first addition is guaranteed to be
	//	zero, so the only computationally significant outcome is
	//	whether or not it carries. When does it carry? Following
	//	the operations, the condition turns out to be simple: it
	//	carries iff x6 is non-zero. The carry can therefore be
	//	computed by adding -1 to x6, which is what the next
	//	instruction does. (A C sketch of the whole word-by-word
	//	loop, including this trick, follows the Linner loop
	//	below.)
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner
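	// For reference: Lmul_mont together with L1st/Linner implements
	// word-by-word Montgomery multiplication. A rough, hypothetical C
	// model of the loop structure follows (mont_mul_sketch, limb and
	// dlimb are illustrative names, not BoringSSL API; num is the limb
	// count and n0 == -np[0]^-1 mod 2^64):
	//
	//	#include <stdint.h>
	//	typedef uint64_t limb;
	//	typedef unsigned __int128 dlimb;
	//
	//	// tp[0..num] starts out zero; tp[num] is the overflow word
	//	// (x19). On return, tp[0..num-1] = ap*bp*2^(-64*num) mod np,
	//	// up to one final subtraction of np.
	//	static void mont_mul_sketch(limb *tp, const limb *ap,
	//	                            const limb *bp, const limb *np,
	//	                            limb n0, int num) {
	//		for (int i = 0; i < num; i++) {		// Louter
	//			limb c = 0;
	//			for (int j = 0; j < num; j++) {	// ap[j]*bp[i]
	//				dlimb v = (dlimb)ap[j]*bp[i] + tp[j] + c;
	//				tp[j] = (limb)v;
	//				c = (limb)(v >> 64);
	//			}
	//			dlimb top = (dlimb)tp[num] + c;
	//			limb m1 = tp[0]*n0;	// "tp[0]"*n0
	//			// the low word of tp[0] + m1*np[0] is 0 by
	//			// choice of m1; the carry into word 1 is
	//			// hi(m1*np[0]) + (tp[0] != 0), the latter
	//			// being the "subs xzr,x6,#1" trick above
	//			c = (limb)(((dlimb)m1*np[0] + tp[0]) >> 64);
	//			for (int j = 1; j < num; j++) {	// np[j]*m1
	//				dlimb v = (dlimb)m1*np[j] + tp[j] + c;
	//				tp[j-1] = (limb)v;	// tp[j-1]
	//				c = (limb)(v >> 64);
	//			}
	//			top += c;
	//			tp[num-1] = (limb)top;
	//			tp[num] = (limb)(top >> 64); // upmost overflow bit
	//		}
	//		// followed by the conditional subtraction of np
	//		// ("Final step" below)
	//	}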
Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus unconditionally,
	// check whether that borrowed, and conditionally copy the original
	// value back.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret


.align 5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
	//	a[1]a[0]	(i)
	//	a[2]a[0]
	//	a[3]a[0]
	//	a[4]a[0]
	//	a[5]a[0]
	//	a[6]a[0]
	//	a[7]a[0]
	//	a[2]a[1]	(ii)
	//	a[3]a[1]
	//	a[4]a[1]
	//	a[5]a[1]
	//	a[6]a[1]
	//	a[7]a[1]
	//	a[3]a[2]	(iii)
	//	a[4]a[2]
	//	a[5]a[2]
	//	a[6]a[2]
	//	a[7]a[2]
	//	a[4]a[3]	(iv)
	//	a[5]a[3]
	//	a[6]a[3]
	//	a[7]a[3]
	//	a[5]a[4]	(v)
	//	a[6]a[4]
	//	a[7]a[4]
	//	a[6]a[5]	(vi)
	//	a[7]a[5]
	//	a[7]a[6]	(vii)
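	// The schedule above generates every cross product a[i]*a[j] with
	// i < j exactly once; the doubling and the diagonal a[i]*a[i]
	// terms are applied later (Lsqr4x_shift_n_add). A hypothetical C
	// reference for the overall squaring structure (sqr_sketch is an
	// illustrative name only; limb/dlimb as in the sketch after the
	// Linner loop above):
	//
	//	// t[0..2*n-1] = a[0..n-1]^2
	//	static void sqr_sketch(limb *t, const limb *a, int n) {
	//		for (int k = 0; k < 2*n; k++) t[k] = 0;
	//		for (int i = 0; i < n; i++) {	// cross products
	//			limb c = 0;
	//			for (int j = i+1; j < n; j++) {
	//				dlimb v = (dlimb)a[i]*a[j] + t[i+j] + c;
	//				t[i+j] = (limb)v;
	//				c = (limb)(v >> 64);
	//			}
	//			t[i+n] = c;
	//		}
	//		for (int k = 2*n-1; k > 0; k--)	// double
	//			t[k] = (t[k] << 1) | (t[k-1] >> 63);
	//		t[0] <<= 1;
	//		limb c = 0;
	//		for (int i = 0; i < n; i++) {	// add diagonal a[i]^2
	//			dlimb v = (dlimb)a[i]*a[i] + t[2*i] + c;
	//			t[2*i] = (limb)v;
	//			dlimb w = (dlimb)t[2*i+1] + (limb)(v >> 64);
	//			t[2*i+1] = (limb)w;
	//			c = (limb)(w >> 64);
	//		}
	//	}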
	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//	a[8]a[0]
	//	a[9]a[0]
	//	a[a]a[0]
	//	a[b]a[0]
	//	a[c]a[0]
	//	a[d]a[0]
	//	a[e]a[0]
	//	a[f]a[0]
	//	a[8]a[1]
	//	a[f]a[1]........................
	//	a[8]a[2]
	//	a[f]a[2]........................
	//	a[8]a[3]
	//	a[f]a[3]........................
	//	a[8]a[4]
	//	a[f]a[4]........................
	//	a[8]a[5]
	//	a[f]a[5]........................
	//	a[8]a[6]
	//	a[f]a[6]........................
	//	a[8]a[7]
	//	a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align 4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align 4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
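	// Note: "extr Xd,Xhi,Xlo,#63" extracts bits [126:63] of the pair
	// Xhi:Xlo, i.e. Xd = (Xhi << 1) | (Xlo >> 63). The loop above thus
	// doubles the accumulated cross products one word at a time,
	//
	//	t2[k] = (t[k] << 1) | (t[k-1] >> 63);	// the extr step
	//
	// while the interleaved mul/umulh pairs feed the diagonal
	// a[i]*a[i] terms into the adcs chain (cf. the C sketch after the
	// (i)-(vii) schedule above).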
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction
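	// The loop above is the core of a standard Montgomery reduction,
	// consuming eight words of n[] per pass; the t[0]*n0 multipliers
	// are saved and replayed by Lsqr8x_tail for the high halves. A
	// hypothetical C model (mont_reduce_sketch is an illustrative
	// name; limb/dlimb as in the earlier sketches; n0 == -n[0]^-1
	// mod 2^64):
	//
	//	// t[0..2*num-1] -> t[num..2*num-1] = t*2^(-64*num) mod n,
	//	// up to one final subtraction; returns the top-most carry
	//	// (x30 above)
	//	static limb mont_reduce_sketch(limb *t, const limb *n,
	//	                               limb n0, int num) {
	//		limb top = 0;
	//		for (int i = 0; i < num; i++) {
	//			limb m = t[i]*n0;	// t[0]*n0
	//			limb c = 0;
	//			for (int j = 0; j < num; j++) {
	//				dlimb v = (dlimb)m*n[j] + t[i+j] + c;
	//				t[i+j] = (limb)v; // t[i] becomes 0
	//				c = (limb)(v >> 64);
	//			}
	//			// propagate through the remaining words
	//			for (int k = i+num; c != 0 && k < 2*num; k++) {
	//				dlimb v = (dlimb)t[k] + c;
	//				t[k] = (limb)v;
	//				c = (limb)(v >> 64);
	//			}
	//			top += c;	// carry off the end
	//		}
	//		return top;
	//	}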
	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align 4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus unconditionally,
	// check whether that borrowed, and conditionally copy the original
	// value back.
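	// In C terms, with "top" being the top-most carry in x30
	// (hypothetical fragment; limb/dlimb as in the earlier sketches):
	//
	//	limb borrow = 0;
	//	for (int j = 0; j < num; j++) {		// Lsqr8x_sub
	//		dlimb d = (dlimb)t[j] - n[j] - borrow;
	//		r[j] = (limb)d;
	//		borrow = (limb)(d >> 64) & 1;	// 1 iff sbcs borrowed
	//	}
	//	borrow = (top < borrow);		// sbcs xzr,x30,xzr
	//	for (int j = 0; j < num; j++)		// csel ...,lo
	//		rp[j] = borrow ? t[j] : r[j];	// t if t < n, else t-n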
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align 4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret


.align 5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction
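	// The b[] index x28 is modulo-scheduled: it advances by 8 each
	// iteration and is masked with "and x28,x28,#31", so the
	// "ldr x24,[x2,x28]" above cycles through the four words of the
	// current b[] window and returns to offset 0 ("next b[i] (or
	// b[0])"), which is also what terminates the loop via cbnz.
	// Roughly (hypothetical sketch):
	//
	//	off = (off + 8) & 31;	// 8, 16, 24, 0, ...
	//	next_b = bp[off/8];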
	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align 5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align 4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align 4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align 4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align 4
Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus unconditionally,
	// check whether that borrowed, and conditionally copy the original
	// value back.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align 4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif  // !OPENSSL_NO_ASM