#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#	AIX performance
#
#	MEASUREMENTS WITH cc ON A 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#	Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
#	OpenSSL 0.9.7c 30 Sep 2003
#
#	Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#	Performance increase of ~60%
#
#	If you have comments or suggestions to improve code send
#	me a note at schari (at) us.ibm.com
#

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
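# A minimal invocation sketch (the file names below are only illustrative;
# the real flavours and output names come from the OpenSSL build system):
#
#	perl ppc.pl linux32 bn-ppc.s	# emit 32-bit mnemonics
#	perl ppc.pl aix64   bn-ppc64.s	# emit 64-bit mnemonics
#
# The first argument picks the mnemonic set above; the generated text is
# piped through ppc-xlate.pl, which handles name decoration and assembler
# syntax differences for the target platform.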
$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand optimized assembly code for the following routines
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve code send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly
#
#
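# For reference, a sketch (comments only) of what the sqr_add_c and
# sqr_add_c2 steps below compute; the names follow the generic OpenSSL
# bn_asm.c convention, the shapes here are illustrative:
#
#	sqr_add_c(a,i,c1,c2,c3):    (c3,c2,c1) += a[i]*a[i]
#	sqr_add_c2(a,i,j,c1,c2,c3): (c3,c2,c1) += 2*a[i]*a[j]
#
# Each step is one word-by-word multiply whose double-width product is
# accumulated into a three-word column sum; the first named c-word is the
# low word for the current result column and the roles rotate per column.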
	xor	r0,r0,r0	# set r0 = 0. Used in the addze
				# instructions below

	#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5	#in first iteration. No need
				#to add since c1=c2=c3=0.
				# Note c3(r11) is NOT set to 0
				# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7	# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0		# catch carry if any.
				# r9= r0(=0) and carry

	addc	r10,r7,r10	# now add to temp result.
	addze	r11,r8		# r8 added to r11 which is 0
	addze	r9,r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;
	#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor	r0,r0,r0	#set r0 = 0. Used in addze
				#instructions below.

	#sqr_add_c(a,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5	#1st iteration: no carries.
	$UMULH	r10,r5,r5
	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10	#add the two register number
	adde	r11,r8,r0	# (r8,r7) to the three register
	addze	r9,r0		# number (r9,r11,r10). NOTE: r0=0

	addc	r10,r7,r10	#add the two register number
	adde	r11,r8,r11	# (r8,r7) to the three register
	addze	r9,r9		# number (r9,r11,r10).

	$ST	r10,`1*$BNSZ`(r3)	# r[1]=c2

	#sqr_add_c(a,1,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1;
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2;
	#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3;
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1;
	#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2;
	#sqr_add_c(a,4,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`8*$BNSZ`(r3)	#r[8]=c3;
	#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD	r5,`4*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`9*$BNSZ`(r3)	#r[9]=c1;
	#sqr_add_c(a,5,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`10*$BNSZ`(r3)	#r[10]=c2;
	#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD	r5,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
	#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

	#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
	#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9,`15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0	#r0=0. Used in addze below.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0	#r0=0. Used in addze below.

	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9		# since we didn't set r12 to zero before.
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
	#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
	#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
	#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
	#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
	#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
	#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
	#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
	#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
	#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
	#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

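# For reference (comments only): each #mul_add_c(a[i],b[j],c1,c2,c3) marker
# in the comba routines above stands for the generic OpenSSL bn_asm.c macro
#	mul_add_c(a,b,c0,c1,c2):  (c2,c1,c0) += a*b
# i.e. one word-by-word product accumulated into a three-word column sum;
# the roles of c1/c2/c3 rotate as the result index advances.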
#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0		#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

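# The next routine performs a double-word by single-word division, h:l / d,
# in the same way as the generic C bn_div_words() fallback (comments only;
# a rough restatement of the code that follows): normalize d so that its
# top bit is set, then produce the quotient as two half-word digits, each
# starting from the estimate q = h/dh and correcting q downwards at most a
# few times before forming the partial remainder.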
#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
# This is a cleaned up version of code generated by
# the AIX compiler. The only optimization is to use
# the PPC instruction to count leading zeros instead
# of call to num_bits_word. Since this was compiled
# only at level -O2 we can possibly squeeze it more?
#
# r3 = h
# r4 = l
# r5 = d

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
# Optimized version of bn_sqr_words
#
# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
# r3 = r
# r4 = a
# r5 = n
#
# r6 = a[i].
# r7,r8 = product.
#
# No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

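# For reference (comments only): the #mul(...) and #mul_add(...) markers in
# these word loops stand for the generic bn_asm.c macros, roughly
#	mul(r,a,w,c):      t = a*w + c;      r = low(t);  c = high(t)
#	mul_add(r,a,w,c):  t = a*w + c + r;  r = low(t);  c = high(t)
# The four-way unrolled loops keep the running carry alternately in r10 and
# r12, which is why every other addze is folded into the following adde.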
#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;
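# The substitution above evaluates every back-quoted expression in the
# template, so for the 32-bit flavour ($BNSZ==4) a template line such as
#	$LD	r6,`2*$BNSZ`(r4)
# is emitted as
#	lwz	r6,8(r4)
# (the mnemonic itself was already interpolated when $data was assigned).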