#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#	AIX performance
#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#	Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
#	OpenSSL 0.9.7c 30 Sep 2003
#
#	Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#	Performance increase of ~60%
#
#	If you have comments or suggestions to improve code send
#	me a note at schari@us.ibm.com
#

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03	Suresh Chari
#			(with lots of help from)	Andy Polyakov
##
#	1. Initial version	10/20/02	Suresh Chari
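#
#	For reference, a rough C model of one mul_add_c(a,b,c1,c2,c3) step
#	used throughout the comba routines below (an approximation of the
#	comba macros in OpenSSL's bn_lcl.h for 32-bit BN_ULONG, not a
#	verbatim copy):
#
#		unsigned long long t = (unsigned long long)a * b;
#		t  += c1;		/* 32x32->64 product plus low word */
#		c1  = (unsigned int)t;
#		t   = (t >> 32) + c2;	/* propagate the carry chain       */
#		c2  = (unsigned int)t;
#		c3 += (unsigned int)(t >> 32);
#
#	sqr_add_c is the same with a == b; sqr_add_c2 adds the product twice.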
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand optimized assembly code for the following routines
#
#		bn_sqr_comba4
#		bn_sqr_comba8
#		bn_mul_comba4
#		bn_mul_comba8
#		bn_sub_words
#		bn_add_words
#		bn_div_words
#		bn_sqr_words
#		bn_mul_words
#		bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve code send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly
#
#
	xor	r0,r0,r0		# set r0 = 0. Used in the addze
					# instructions below

	#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5		#in first iteration. No need
					#to add since c1=c2=c3=0.
					# Note c3(r11) is NOT set to 0
					# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9 = r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
	addze	r11,r8			# r8 added to r11 which is 0
	addze	r9,r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;
	#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor	r0,r0,r0		#set r0 = 0. Used in addze
					#instructions below.

	#sqr_add_c(a,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5		#1st iteration: no carries.
	$UMULH	r10,r5,r5
	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r0		# (r8,r7) to the three register
	addze	r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r11		# (r8,r7) to the three register
	addze	r9,r9			# number (r9,r11,r10).
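	# Off-diagonal terms a[i]*a[j] (i != j) occur twice in the square,
	# so each sqr_add_c2 block in this routine adds the product held in
	# (r8,r7) to the running (c1,c2,c3) chain twice (two addc/adde/addze
	# passes) instead of doubling the product first, as bn_sqr_comba4
	# above does with its addc/adde pair.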

	$ST	r10,`1*$BNSZ`(r3)	# r[1]=c2

	#sqr_add_c(a,1,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1;
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2;
	#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3;
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1;
	#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2;
	#sqr_add_c(a,4,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`8*$BNSZ`(r3)	#r[8]=c3;
	#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD	r5,`4*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`9*$BNSZ`(r3)	#r[9]=c1;
	#sqr_add_c(a,5,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`10*$BNSZ`(r3)	#r[10]=c2;
	#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD	r5,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
	#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

	#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
	#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9,`15*$BNSZ`(r3)	#r[15]=c1;


	blr

	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
	#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
	#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
	#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
	#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
	#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
	#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
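	# (The straight-line mul_add_c groups in this routine follow the
	# usual comba column order: every product a[i]*b[j] with i+j == k
	# is folded into the chain before r[k] is stored. A rough C
	# equivalent of the whole routine, using the mul_add_c model from
	# the header comment near the top of this file:
	#
	#	for (k = 0; k < 15; k++) {
	#		for (i = 0; i <= k; i++)
	#			if (i < 8 && k - i < 8)
	#				mul_add_c(a[i], b[k-i], c1, c2, c3);
	#		r[k] = c1; c1 = c2; c2 = c3; c3 = 0;
	#	}
	#	r[15] = c1;
	# )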
	#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
	#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
	#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
	#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
	#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

	xor	r0,r0,r0		#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6		# If r6 is 0 then result is 0.
					# if r6 > 0 then result != 0
					# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7		# r6 = r7 + carry bit + ones complement(r8)
					# if carry = 1 this is r7-r8. Else it
					# is r7-r8-1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0			# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1			# keep only last bit.
	blr
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0			#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
# This is a cleaned up version of code generated by
# the AIX compiler. The only optimization is to use
# the PPC instruction to count leading zeros instead
# of a call to num_bits_word. Since this was compiled
# only at level -O2 we can possibly squeeze it more?
#
# r3 = h
# r4 = l
# r5 = d

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there're, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2-i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret = q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
# Optimized version of bn_sqr_words
#
# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
# r3 = r
# r4 = a
# r5 = n
#
# r6 = a[i].
# r7,r8 = product.
#
# No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0. used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0x00000000
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;
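
# Usage sketch (assumed from OpenSSL's perlasm conventions rather than
# documented in this file): the build system runs this script with a
# flavour argument and an output file name, roughly like
#
#	perl ppc.pl linux32 bn-ppc.s
#	perl ppc.pl aix64   bn-ppc64.s
#
# Only the "32"/"64" part of the flavour is examined above; the full
# flavour string is forwarded to ppc-xlate.pl, which handles the
# platform-specific symbol decoration for the .bn_* entry points.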