// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// Target: AArch64, GNU assembler syntax, passed through the C preprocessor.
// The file defines one visible entry point, bn_mul_mont, and two local
// helpers that it branches to (they are never called, only jumped to, so
// they return directly to bn_mul_mont's caller):
//
//   bn_mul_mont       generic word-by-word Montgomery multiplication
//   __bn_sqr8x_mont   squaring, processed eight words at a time
//   __bn_mul4x_mont   multiplication, processed four words at a time
//
// All three paths finish by subtracting the modulus from the intermediate
// result and selecting between the two candidates with csel, wiping the
// stack scratch area as they go, and return 1 in x0.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

//----------------------------------------------------------------------
// bn_mul_mont
//
// Register use on entry, as established by the code below:
//   x0 = rp   result words (written by .Lsub / .Lcond_copy)
//   x1 = ap   first operand words
//   x2 = bp   second operand words
//   x3 = np   modulus words
//   x4 = pointer to n0 (dereferenced once; only the loaded word is used)
//   x5 = num  number of 64-bit words; used as a full 64-bit value and
//             scaled by 8 into a byte count
// Returns:  x0 = 1
//
// Dispatch:
//   num multiple of 8 -> __bn_sqr8x_mont (which itself diverts to
//                        __bn_mul4x_mont when ap != bp)
//   num multiple of 4 -> __bn_mul4x_mont
//   otherwise         -> generic loop at .Lmul_mont
//
// Frame (.Lmul_mont path): 64 bytes at x29 holding x29/x30 and x19-x24,
// plus a variable-size scratch vector tp[] carved out below it.
//----------------------------------------------------------------------
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// x5 = num in bytes from here on
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves only one
	//	computationally significant outcome: whether it carries
	//	or not. So when does it carry? Is there an alternative
	//	way to deduce it? If you follow the operations, you can
	//	observe that the condition for carry is quite simple:
	//	x6 being non-zero. So the carry can be calculated
	//	by adding -1 to x6. That's what the next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// First pass: tp[] = (ap[]*bp[0] + np[]*m1) shifted down one word.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Remaining passes: one per further word of bp[], accumulating
	// into tp[] (x20 counts the bytes of bp[] still to process).
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr		// consumes the carry left by the previous step
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
	// Select tp[] (borrow) or rp[] (no borrow) word by word with csel,
	// zeroing each tp[] word once it has been read.
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue. Only x29 is reloaded at the end: x30 is never written
	// on the .Lmul_mont path.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

//----------------------------------------------------------------------
// __bn_sqr8x_mont
//
// Reached by branch from bn_mul_mont when num is a multiple of 8, with
// the argument registers untouched. Diverts to __bn_mul4x_mont unless
// ap == bp (x1 == x2), i.e. unless the operation is a squaring.
//
// Frame: 128 bytes at x29.
//   [x29,#0]   saved x29        [x29,#8]   saved x30
//   [x29,#16..#95]              saved x19-x28
//   [x29,#96]  rp               [x29,#104] np
//   [x29,#112] n0 (the loaded word, not the pointer)
// Scratch: 2*num words below the frame (sp lowered by num*16 bytes).
// x30 is used as a scratch register (top-most carry) and is reloaded
// from [x29,#8] before returning.
// Returns:  x0 = 1
//----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3		// x5 = num in bytes from here on
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Zero the scratch area. The loop is entered at
	// .Lsqr8x_zero_start, so the stores to the first eight words are
	// bypassed on the first pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = end of a[]
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
	// x27 runs from -64 up to 0 in steps of 8 and doubles as the
	// offset (relative to x0) of the next multiplier word.
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	// The doubling is done with extr ...,#63, which shifts each word
	// left by one bit and pulls in the top bit of the word below it.
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5		// x3 = end of n[]
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) carry = (x19 != 0), same trick as in bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

	// Multiply the next eight modulus words by the eight t[0]*n0
	// values put aside by .Lsqr8x_reduction (re-read through x0+x27).
.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Four words per pass: csel picks the scratch word (borrow) or
	// the rp[] word (no borrow); both halves of the scratch area
	// (via x2 and via x1) are overwritten with zeros along the way.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

	// Short path taken from the first reduction when the modulus is
	// exactly eight words long: the whole result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

	// Common epilogue; x30 has already been reloaded on both paths
	// that reach this label.
.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

//----------------------------------------------------------------------
// __bn_mul4x_mont
//
// Reached by branch from bn_mul_mont (num a multiple of 4) or from
// __bn_sqr8x_mont (ap != bp), with the argument registers untouched.
//
// Frame: 128 bytes at x29.
//   [x29,#0]   saved x29        [x29,#8]   saved x30
//   [x29,#16..#95]              saved x19-x28
//   [x29,#96]  rp               [x29,#104] &b[num]
// Scratch: num+4 words below the frame. The first four words at sp hold
// the t[0]*n0 values put aside by the reduction loops; t[] follows.
// x28 is a byte offset cycling through 8,16,24,0 (add #8, and #31) and
// selects both the next b[] word and the next saved t[0]*n0.
// x30 is used as a scratch register (top-most carry) and is reloaded
// from [x29,#8] before returning.
// Returns:  x0 = 1
//----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3		// x5 = num in bytes from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = end of a[]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry = (x19 != 0), same trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// t[0]*n0 (first value put aside above)
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 = top-most carry (x30 is scratch here)
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry = (x19 != 0), same trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next t[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Four words per pass: csel picks the t[] word (borrow) or the
	// rp[] word (no borrow); the scratch area is zeroed through x26.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// Last group. The stores at #8*3 and #8*4 overlap the one at
	// #8*2; together the four stp cover six words starting at x26.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

	// Short path taken from the first reduction when a[] is exactly
	// four words long: the whole result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

	// Common epilogue; x30 has already been reloaded on both paths
	// that reach this label.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// NUL-terminated ASCII identification string:
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif
#endif  // !OPENSSL_NO_ASM