#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must. In
# fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge is
# minimizing the number of "collateral" instructions and, of course,
# instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60%, depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
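
# For orientation, a minimal C sketch of the word-serial Montgomery
# multiplication that bn_mul_mont implements. Illustrative only, not part
# of the module: "mont_mul_ref" is a made-up name, n0 is passed by value
# (the assembly dereferences the pointer), t[] is a zeroed num+2 word
# scratch array, and unsigned __int128 is assumed. The assembly fuses the
# two inner loops into a single pass; they are kept separate here for
# clarity.
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;
#
#	static void mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#	                         const u64 *np, u64 n0, int num, u64 *t)
#	{
#		for (int i = 0; i < num; i++) {
#			u128 r;
#			u64  c = 0, m;
#
#			for (int j = 0; j < num; j++) {	/* t += ap[]*bp[i] */
#				r = (u128)ap[j]*bp[i] + t[j] + c;
#				t[j] = (u64)r;  c = (u64)(r>>64);
#			}
#			r = (u128)t[num] + c;
#			t[num] = (u64)r;  t[num+1] = (u64)(r>>64);
#
#			m = t[0]*n0;			/* "tp[0]"*n0 mod 2^64 */
#			r = (u128)np[0]*m + t[0];	/* low word cancels */
#			c = (u64)(r>>64);
#			for (int j = 1; j < num; j++) {	/* t = (t+m*np[])>>64 */
#				r = (u128)np[j]*m + t[j] + c;
#				t[j-1] = (u64)r;  c = (u64)(r>>64);
#			}
#			r = (u128)t[num] + c;
#			t[num-1] = (u64)r;
#			t[num] = t[num+1] + (u64)(r>>64);
#		}
#		/* rp = t - np if that doesn't borrow, t otherwise */
#	}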

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. So
	//	when does it carry? Is there an alternative way to deduce
	//	that? If you follow the operations, you can observe that
	//	the condition for carry is quite simple: $lo0 being
	//	non-zero. The carry can therefore be calculated by adding
	//	-1 to $lo0, which is what the next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
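
# The final-step pattern above (always subtract, check the borrow, select
# with csel) recurs in all three code paths of this module; it avoids a
# data-dependent branch. A branch-free C sketch of the same idea,
# illustrative only ("mont_cond_sub" is a made-up name; ovf is the
# top-most overflow bit of the accumulator):
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;
#
#	static void mont_cond_sub(u64 *rp, const u64 *tp, const u64 *np,
#	                          u64 ovf, int num)
#	{
#		u64 borrow = 0, mask;
#
#		for (int i = 0; i < num; i++) {	/* rp = tp - np */
#			u128 r = (u128)tp[i] - np[i] - borrow;
#			rp[i] = (u64)r;
#			borrow = (u64)(r >> 64) & 1;
#		}
#		mask = 0 - (u64)(ovf < borrow);	/* all-ones if it borrowed */
#		for (int i = 0; i < num; i++)	/* select without branching */
#			rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
#	}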
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
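
	// The loops in this procedure implement the standard squaring
	// decomposition: with A = sum a[i]*2^(64*i),
	//
	//	A^2 = sum a[i]^2*2^(128*i)
	//	    + 2*sum a[i]*a[j]*2^(64*(i+j)),	i<j
	//
	// Only the distinct cross products a[i]*a[j], i<j, are accumulated
	// in t[] here (eight at a time, as charted above); the doubling
	// and the a[i]*a[i] diagonal terms are folded in at
	// .Lsqr8x_outer_break.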
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
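	// The doubling is done on the fly with extr: "extr x,hi,lo,#63"
	// yields (hi<<1)|(lo>>63), i.e. the next doubled word complete
	// with the bit shifted in from the word below; the a[i]*a[i]
	// diagonal terms are then added in the same carry chain.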
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
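	// Each iteration of .Lsqr8x_reduction below disposes of one word
	// of t[]: na0 = t[0]*n0 is the Montgomery multiplier, na0*n[0..7]
	// is added to t[] (the lowest word is guaranteed to cancel, hence
	// the (*) trick), and the accumulator window slides down by one
	// word. Eight iterations cover 512 bits; .Lsqr8x_tail then
	// propagates the stashed na0 multipliers through the remaining
	// words of n[].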
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.
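#
# A couple of layout notes for the loops below: $cnt steps through
# 8,16,24,0 courtesy of "and $cnt,$cnt,#31", so that "ldr $bi,[$bp,$cnt]"
# picks up b[1..3] and then wraps around to b[0] (the b[] pointer itself
# advances by four words per outer pass); the t[0]*n0 multipliers
# computed during reduction are put aside at sp[] and replayed by the
# tail loops over the remaining words of n[].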

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;