;
; PA-RISC 2.0 implementation of bn_asm code, based on the
; 64-bit version of the code.  This code is effectively the
; same as the 64-bit version except the register model is
; slightly different given all values must be 32-bit between
; function calls.  Thus the 64-bit return values are returned
; in %ret0 and %ret1 (upper 32 bits in %ret0) rather than just
; %ret0 as is done in the 64-bit version.
;
;
; This code is approximately 2x faster than the C version
; for RSA/DSA.
;
; See http://devresource.hp.com/  for more details on the PA-RISC
; architecture.  Also see the book "PA-RISC 2.0 Architecture"
; by Gerry Kane for information on the instruction set architecture.
;
; Code written by Chris Ruemmler (with some help from the HP C
; compiler).
;
; The code compiles with HP's assembler
;

	.level	2.0N
	.space	$TEXT$
	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY

;
; Global Register definitions used for the routines.
;
; Some information about HP's runtime architecture for 32-bits.
;
; "Caller save" means the calling function must save the register
; if it wants the register to be preserved.
; "Callee save" means if a function uses the register, it must save
; the value before using it.
;
; For the floating point registers
;
;    "caller save" registers: fr4-fr11, fr22-fr31
;    "callee save" registers: fr12-fr21
;    "special" registers: fr0-fr3 (status and exception registers)
;
; For the integer registers
;     value zero             : r0
;     "caller save" registers: r1,r19-r26
;     "callee save" registers: r3-r18
;     return register        : r2  (rp)
;     return values          : r28,r29  (ret0,ret1)
;     Stack pointer          : r30  (sp)
;     millicode return ptr   : r31  (also a caller save register)


;
; Arguments to the routines
;
; b_ptr and num deliberately share %r24: routines that take a third
; pointer receive their count in n (%r23) instead.
;
r_ptr	.reg	%r26
a_ptr	.reg	%r25
b_ptr	.reg	%r24
num	.reg	%r24
n	.reg	%r23

;
; Note that the "w" argument for bn_mul_add_words and bn_mul_words
; is passed on the stack at a delta of -56 from the top of stack
; as the routine is entered.
;

;
; Globals used in some routines
;
; top_overflow aliases %r23 (the same register as n); the routines in
; this part of the file that use it take their count in num (%r24).
;

top_overflow	.reg	%r23
high_mask	.reg	%r22	; value 0xffffffff80000000L


;------------------------------------------------------------------------------
;
; bn_mul_add_words
;
;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
;								int num, BN_ULONG w)
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = num   (%r24)
; -56(sp) =  w
;
; For each of the num 64-bit words: rp[i] = rp[i] + ap[i]*w + carry.
; The running carry is kept in %ret1; on exit %ret0 receives the upper
; 32 bits of %ret1.  Returns 0 when num <= 0.
;
; Each 64x64 multiply is built from four 32x32 XMPYU products
;   m1 = a_hi*w_lo, m = a_lo*w_hi, ht = a_hi*w_hi, lt = a_lo*w_lo
; which are moved from the FP registers to the integer registers
; through scratch slots at -8..-64(sp) inside the 128-byte frame.
;
; Saves/restores r3-r9 (callee save) in the low part of the frame.
;
; Local register definitions
;

fm1	.reg	%fr22
fm	.reg	%fr23
ht_temp	.reg	%fr24
ht_temp_1	.reg	%fr25
lt_temp	.reg	%fr26
lt_temp_1	.reg	%fr27
fm1_1	.reg	%fr28
fm_1	.reg	%fr29

fw_h	.reg	%fr7L
fw_l	.reg	%fr7R
fw	.reg	%fr7

fht_0	.reg	%fr8L
flt_0	.reg	%fr8R
t_float_0	.reg	%fr8

fht_1	.reg	%fr9L
flt_1	.reg	%fr9R
t_float_1	.reg	%fr9

tmp_0	.reg	%r31
tmp_1	.reg	%r21
m_0	.reg	%r20
m_1	.reg	%r19
ht_0	.reg	%r1
ht_1	.reg	%r3
lt_0	.reg	%r4
lt_1	.reg	%r5
m1_0	.reg	%r6
m1_1	.reg	%r7
rp_val	.reg	%r8
rp_val_1	.reg	%r9

bn_mul_add_words
	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
	.proc
	.callinfo frame=128
	.entry
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	NOP			; Needed to make the loop 16-byte aligned
	NOP			; needed to make the loop 16-byte aligned

	STD	%r5,16(%sp)	; save r5
	NOP
	STD	%r6,24(%sp)	; save r6
	STD	%r7,32(%sp)	; save r7

	STD	%r8,40(%sp)	; save r8
	STD	%r9,48(%sp)	; save r9
	COPY	%r0,%ret1	; return 0 by default (carry = 0)
	DEPDI,Z	1,31,1,top_overflow	; top_overflow = 1 << 32

	CMPIB,>=	0,num,bn_mul_add_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp	; bump stack (delay slot, runs on both paths)

	;
	; The loop is unrolled twice, so if there is only 1 number
	; then go straight to the cleanup code.
	;
	CMPIB,=	1,num,bn_mul_add_words_single_top
	FLDD	-184(%sp),fw	; (-56-128) load up w into fw (fw_h/fw_l)

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
	; two 32-bit multiplies can be issued per cycle.
	;
bn_mul_add_words_unroll2

	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	FLDD	8(a_ptr),t_float_1	; load up 64-bit value (fr9) ht(L)/lt(R)
	LDD	0(r_ptr),rp_val		; rp[0]
	LDD	8(r_ptr),rp_val_1	; rp[1]

	XMPYU	fht_0,fw_l,fm1		; m1[0] = fht_0*fw_l
	XMPYU	fht_1,fw_l,fm1_1	; m1[1] = fht_1*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1[0]
	FSTD	fm1_1,-48(%sp)		; -48(sp) = m1[1]

	XMPYU	flt_0,fw_h,fm		; m[0] = flt_0*fw_h
	XMPYU	flt_1,fw_h,fm_1		; m[1] = flt_1*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m[0]
	FSTD	fm_1,-40(%sp)		; -40(sp) = m[1]

	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = fht_0*fw_h
	XMPYU	fht_1,fw_h,ht_temp_1	; ht_temp_1 = fht_1*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht_temp
	FSTD	ht_temp_1,-56(%sp)	; -56(sp) = ht_temp_1

	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = flt_0*fw_l
	XMPYU	flt_1,fw_l,lt_temp_1	; lt_temp_1 = flt_1*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt_temp
	FSTD	lt_temp_1,-64(%sp)	; -64(sp) = lt_temp_1

	LDD	-8(%sp),m_0		; m[0]
	LDD	-40(%sp),m_1		; m[1]
	LDD	-16(%sp),m1_0		; m1[0]
	LDD	-48(%sp),m1_1		; m1[1]

	LDD	-24(%sp),ht_0		; ht[0]
	LDD	-56(%sp),ht_1		; ht[1]
	ADD,L	m1_0,m_0,tmp_0		; tmp_0 = m[0] + m1[0];
	ADD,L	m1_1,m_1,tmp_1		; tmp_1 = m[1] + m1[1];

	LDD	-32(%sp),lt_0		; lt[0]
	LDD	-64(%sp),lt_1		; lt[1]
	CMPCLR,*>>=	tmp_0,m1_0, %r0	; if (m[0] < m1[0]), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht[0] += (1<<32)

	CMPCLR,*>>=	tmp_1,m1_1,%r0	; if (m[1] < m1[1]), i.e. the sum wrapped
	ADD,L	ht_1,top_overflow,ht_1	; ht[1] += (1<<32)
	EXTRD,U	tmp_0,31,32,m_0		; m[0]>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1[0] = m[0]<<32

	EXTRD,U	tmp_1,31,32,m_1		; m[1]>>32
	DEPD,Z	tmp_1,31,32,m1_1	; m1[1] = m[1]<<32
	ADD,L	ht_0,m_0,ht_0		; ht[0]+= (m[0]>>32)
	ADD,L	ht_1,m_1,ht_1		; ht[1]+= (m[1]>>32)

	ADD	lt_0,m1_0,lt_0		; lt[0] = lt[0]+m1[0];
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	ADD	lt_1,m1_1,lt_1		; lt[1] = lt[1]+m1[1];
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry

	ADD	%ret1,lt_0,lt_0		; lt[0] = lt[0] + c;
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	ADD	lt_0,rp_val,lt_0	; lt[0] = lt[0]+rp[0]
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry

	LDO	-2(num),num		; num = num - 2;
	ADD	ht_0,lt_1,lt_1		; lt[1] = lt[1] + ht_0 (c);
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt[0]

	ADD	lt_1,rp_val_1,lt_1	; lt[1] = lt[1]+rp[1]
	ADD,DC	ht_1,%r0,%ret1		; c = ht[1] + carry
	LDO	16(a_ptr),a_ptr		; a_ptr += 2

	STD	lt_1,8(r_ptr)		; rp[1] = lt[1]
	CMPIB,<=	2,num,bn_mul_add_words_unroll2	; go again if more to do
	LDO	16(r_ptr),r_ptr		; r_ptr += 2 (delay slot)

	CMPIB,=,N	0,num,bn_mul_add_words_exit	; are we done, or cleanup last one

	;
	; Top of loop aligned on 64-byte boundary
	;
bn_mul_add_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	LDD	0(r_ptr),rp_val		; rp[0]
	LDO	8(a_ptr),a_ptr		; a_ptr++
	XMPYU	fht_0,fw_l,fm1		; m1 = ht*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1
	XMPYU	flt_0,fw_h,fm		; m = lt*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = ht*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt

	LDD	-8(%sp),m_0		; m
	LDD	-16(%sp),m1_0		; m1
	ADD,L	m_0,m1_0,tmp_0		; tmp_0 = m + m1;
	LDD	-24(%sp),ht_0		; ht
	LDD	-32(%sp),lt_0		; lt

	CMPCLR,*>>=	tmp_0,m1_0,%r0	; if (m < m1), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0		; ht+= (m>>32)
	ADD	lt_0,m1_0,tmp_0		; tmp_0 = lt+m1;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry
	ADD	%ret1,tmp_0,lt_0	; lt = lt + c;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry
	ADD	lt_0,rp_val,lt_0	; lt = lt+rp[0]
	ADD,DC	ht_0,%r0,%ret1		; c = ht + carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt

bn_mul_add_words_exit
	.EXIT

	EXTRD,U	%ret1,31,32,%ret0	; for 32-bit, return in ret0/ret1
	LDD	-80(%sp),%r9		; restore r9
	LDD	-88(%sp),%r8		; restore r8
	LDD	-96(%sp),%r7		; restore r7
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 and pop the frame (delay slot)
	.PROCEND	;in=23,24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; arg0 = rp
; arg1 = ap
; arg2 = num   (%r24)
; w on stack at -56(sp)
;
; For each of the num 64-bit words: rp[i] = low64(ap[i]*w + carry).
; The running carry is kept in %ret1; on exit %ret0 receives the upper
; 32 bits of %ret1.  Returns 0 when num <= 0.
; Saves/restores r3-r7; uses the same register aliases and the same
; four-product multiply scheme as bn_mul_add_words.

bn_mul_words
	.proc
	.callinfo frame=128
	.entry
	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	NOP
	STD	%r5,16(%sp)	; save r5

	STD	%r6,24(%sp)	; save r6
	STD	%r7,32(%sp)	; save r7
	COPY	%r0,%ret1	; return 0 by default (carry = 0)
	DEPDI,Z	1,31,1,top_overflow	; top_overflow = 1 << 32

	CMPIB,>=	0,num,bn_mul_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp	; bump stack (delay slot, runs on both paths)

	;
	; See if only 1 word to do, thus just do cleanup
	;
	CMPIB,=	1,num,bn_mul_words_single_top
	FLDD	-184(%sp),fw	; (-56-128) load up w into fw (fw_h/fw_l)

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
	; two 32-bit multiplies can be issued per cycle.
	;
bn_mul_words_unroll2

	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	FLDD	8(a_ptr),t_float_1	; load up 64-bit value (fr9) ht(L)/lt(R)
	XMPYU	fht_0,fw_l,fm1		; m1[0] = fht_0*fw_l
	XMPYU	fht_1,fw_l,fm1_1	; m1[1] = fht_1*fw_l

	FSTD	fm1,-16(%sp)		; -16(sp) = m1[0]
	FSTD	fm1_1,-48(%sp)		; -48(sp) = m1[1]
	XMPYU	flt_0,fw_h,fm		; m[0] = flt_0*fw_h
	XMPYU	flt_1,fw_h,fm_1		; m[1] = flt_1*fw_h

	FSTD	fm,-8(%sp)		; -8(sp) = m[0]
	FSTD	fm_1,-40(%sp)		; -40(sp) = m[1]
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = fht_0*fw_h
	XMPYU	fht_1,fw_h,ht_temp_1	; ht_temp_1 = fht_1*fw_h

	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht[0]
	FSTD	ht_temp_1,-56(%sp)	; -56(sp) = ht[1]
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = flt_0*fw_l
	XMPYU	flt_1,fw_l,lt_temp_1	; lt_temp_1 = flt_1*fw_l

	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt[0]
	FSTD	lt_temp_1,-64(%sp)	; -64(sp) = lt[1]
	LDD	-8(%sp),m_0		; m[0]
	LDD	-40(%sp),m_1		; m[1]

	LDD	-16(%sp),m1_0		; m1[0]
	LDD	-48(%sp),m1_1		; m1[1]
	LDD	-24(%sp),ht_0		; ht[0]
	LDD	-56(%sp),ht_1		; ht[1]

	ADD,L	m1_0,m_0,tmp_0		; tmp_0 = m[0] + m1[0];
	ADD,L	m1_1,m_1,tmp_1		; tmp_1 = m[1] + m1[1];
	LDD	-32(%sp),lt_0		; lt[0]
	LDD	-64(%sp),lt_1		; lt[1]

	CMPCLR,*>>=	tmp_0,m1_0, %r0	; if (m[0] < m1[0]), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht[0] += (1<<32)
	CMPCLR,*>>=	tmp_1,m1_1,%r0	; if (m[1] < m1[1]), i.e. the sum wrapped
	ADD,L	ht_1,top_overflow,ht_1	; ht[1] += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m[0]>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1[0] = m[0]<<32
	EXTRD,U	tmp_1,31,32,m_1		; m[1]>>32
	DEPD,Z	tmp_1,31,32,m1_1	; m1[1] = m[1]<<32

	ADD,L	ht_0,m_0,ht_0		; ht[0]+= (m[0]>>32)
	ADD,L	ht_1,m_1,ht_1		; ht[1]+= (m[1]>>32)
	ADD	lt_0,m1_0,lt_0		; lt[0] = lt[0]+m1[0];
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry

	ADD	lt_1,m1_1,lt_1		; lt[1] = lt[1]+m1[1];
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	ADD	%ret1,lt_0,lt_0		; lt[0] = lt[0] + c (ret1);
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry

	ADD	ht_0,lt_1,lt_1		; lt[1] = lt[1] + c (ht_0)
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt[0]
	STD	lt_1,8(r_ptr)		; rp[1] = lt[1]

	COPY	ht_1,%ret1		; carry = ht[1]
	LDO	-2(num),num		; num = num - 2;
	LDO	16(a_ptr),a_ptr		; ap += 2
	CMPIB,<=	2,num,bn_mul_words_unroll2	; go again if more to do
	LDO	16(r_ptr),r_ptr		; rp += 2 (delay slot)

	CMPIB,=,N	0,num,bn_mul_words_exit	; are we done?

	;
	; Top of loop aligned on 64-byte boundary
	;
bn_mul_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)

	XMPYU	fht_0,fw_l,fm1		; m1 = ht*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1
	XMPYU	flt_0,fw_h,fm		; m = lt*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = ht*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt

	LDD	-8(%sp),m_0		; m
	LDD	-16(%sp),m1_0		; m1
	ADD,L	m_0,m1_0,tmp_0		; tmp_0 = m + m1;
	LDD	-24(%sp),ht_0		; ht
	LDD	-32(%sp),lt_0		; lt

	CMPCLR,*>>=	tmp_0,m1_0,%r0	; if (m < m1), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0		; ht+= (m>>32)
	ADD	lt_0,m1_0,lt_0		; lt= lt+m1;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	ADD	%ret1,lt_0,lt_0		; lt = lt + c;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	COPY	ht_0,%ret1		; copy carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt

bn_mul_words_exit
	.EXIT
	EXTRD,U	%ret1,31,32,%ret0	; for 32-bit, return in ret0/ret1
	LDD	-96(%sp),%r7		; restore r7
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 and pop the frame (delay slot)
	.PROCEND

;----------------------------------------------------------------------------
;
;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
;
; For each of the num input words stores the 128-bit square of ap[i]
; as two 64-bit words: rp[2i] = low half, rp[2i+1] = high half.
; With a = ht:lt (32-bit halves) the square is
;   ht*ht << 64  +  2*ht*lt << 32  +  lt*lt
; so the middle product m = ht*lt is split into (m << 33), added to
; the low word, and ((m & high_mask) >> 31), added to the high word.
; Saves/restores r3-r5.
;

bn_sqr_words
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	NOP
	STD	%r5,16(%sp)	; save r5

	CMPIB,>=	0,num,bn_sqr_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp	; bump stack (delay slot, runs on both paths)

	;
	; If only 1, then go straight to cleanup
	;
	CMPIB,=	1,num,bn_sqr_words_single_top
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;

bn_sqr_words_unroll2
	FLDD	0(a_ptr),t_float_0	; a[0]
	FLDD	8(a_ptr),t_float_1	; a[1]
	XMPYU	fht_0,flt_0,fm		; m[0]
	XMPYU	fht_1,flt_1,fm_1	; m[1]

	FSTD	fm,-24(%sp)		; store m[0]
	FSTD	fm_1,-56(%sp)		; store m[1]
	XMPYU	flt_0,flt_0,lt_temp	; lt[0]
	XMPYU	flt_1,flt_1,lt_temp_1	; lt[1]

	FSTD	lt_temp,-16(%sp)	; store lt[0]
	FSTD	lt_temp_1,-48(%sp)	; store lt[1]
	XMPYU	fht_0,fht_0,ht_temp	; ht[0]
	XMPYU	fht_1,fht_1,ht_temp_1	; ht[1]

	FSTD	ht_temp,-8(%sp)		; store ht[0]
	FSTD	ht_temp_1,-40(%sp)	; store ht[1]
	LDD	-24(%sp),m_0		; m[0]
	LDD	-56(%sp),m_1		; m[1]

	AND	m_0,high_mask,tmp_0	; m[0] & Mask
	AND	m_1,high_mask,tmp_1	; m[1] & Mask
	DEPD,Z	m_0,30,31,m_0		; m[0] << 32+1
	DEPD,Z	m_1,30,31,m_1		; m[1] << 32+1

	LDD	-16(%sp),lt_0		; lt[0]
	LDD	-48(%sp),lt_1		; lt[1]
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m[0]&Mask >> 32-1
	EXTRD,U	tmp_1,32,33,tmp_1	; tmp_1 = m[1]&Mask >> 32-1

	LDD	-8(%sp),ht_0		; ht[0]
	LDD	-40(%sp),ht_1		; ht[1]
	ADD,L	ht_0,tmp_0,ht_0		; ht[0] += tmp_0
	ADD,L	ht_1,tmp_1,ht_1		; ht[1] += tmp_1

	ADD	lt_0,m_0,lt_0		; lt[0] = lt[0]+m[0]
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt[0]
	STD	ht_0,8(r_ptr)		; rp[1] = ht[0]

	ADD	lt_1,m_1,lt_1		; lt[1] = lt[1]+m[1]
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	STD	lt_1,16(r_ptr)		; rp[2] = lt[1]
	STD	ht_1,24(r_ptr)		; rp[3] = ht[1]

	LDO	-2(num),num		; num = num - 2;
	LDO	16(a_ptr),a_ptr		; ap += 2
	CMPIB,<=	2,num,bn_sqr_words_unroll2	; go again if more to do
	LDO	32(r_ptr),r_ptr		; rp += 4 (delay slot)

	CMPIB,=,N	0,num,bn_sqr_words_exit	; are we done?

	;
	; Top of loop aligned on 64-byte boundary
	;
bn_sqr_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)

	XMPYU	fht_0,flt_0,fm		; m
	FSTD	fm,-24(%sp)		; store m

	XMPYU	flt_0,flt_0,lt_temp	; lt
	FSTD	lt_temp,-16(%sp)	; store lt

	XMPYU	fht_0,fht_0,ht_temp	; ht
	FSTD	ht_temp,-8(%sp)		; store ht

	LDD	-24(%sp),m_0		; load m
	AND	m_0,high_mask,tmp_0	; m & Mask
	DEPD,Z	m_0,30,31,m_0		; m << 32+1
	LDD	-16(%sp),lt_0		; lt

	LDD	-8(%sp),ht_0		; ht
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m&Mask >> 32-1
	ADD	m_0,lt_0,lt_0		; lt = lt+m
	ADD,L	ht_0,tmp_0,ht_0		; ht += tmp_0 (ADD,L leaves the carry alone)
	ADD,DC	ht_0,%r0,ht_0		; ht += carry from lt+m

	STD	lt_0,0(r_ptr)		; rp[0] = lt
	STD	ht_0,8(r_ptr)		; rp[1] = ht

bn_sqr_words_exit
	.EXIT
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 and pop the frame (delay slot)
	.PROCEND	;in=23,24,25,26,29;out=28;


;----------------------------------------------------------------------------
;
;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; r[i] = a[i] + b[i] + carry for n 64-bit words.  The carry (0 or 1)
; is kept in %ret1 and returned; %ret0 receives the upper 32 bits of
; %ret1.  Returns 0 when n <= 0.  Leaf routine: no frame, only
; caller-save registers are used.

t	.reg	%r22
b	.reg	%r21
l	.reg	%r20

bn_add_words
	.proc
	.entry
	.callinfo
	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	CMPIB,>=	0,n,bn_add_words_exit	; if (n <= 0) then exit
	COPY	%r0,%ret1		; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_add_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_add_words_unroll2
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b
	ADD	t,%ret1,t		; t = a[0]+c;
	ADD,DC	%r0,%r0,%ret1		; set c to carry
	ADD	t,b,l			; l = t + b[0]
	ADD,DC	%ret1,%r0,%ret1		; c+= carry
	STD	l,0(r_ptr)		; r[0] = l

	LDD	8(a_ptr),t
	LDD	8(b_ptr),b
	ADD	t,%ret1,t		; t = a[1]+c;
	ADD,DC	%r0,%r0,%ret1		; set c to carry
	ADD	t,b,l			; l = t + b[1]
	ADD,DC	%ret1,%r0,%ret1		; c+= carry
	STD	l,8(r_ptr)		; r[1] = l

	LDO	-2(n),n			; n -= 2
	LDO	16(a_ptr),a_ptr		; a += 2
	LDO	16(b_ptr),b_ptr		; b += 2

	CMPIB,<=	2,n,bn_add_words_unroll2	; go again if more to do
	LDO	16(r_ptr),r_ptr		; r += 2 (delay slot)

	CMPIB,=,N	0,n,bn_add_words_exit	; are we done?

bn_add_words_single_top
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b

	ADD	t,%ret1,t		; t = t+c;
	ADD,DC	%r0,%r0,%ret1		; set c to carry (could use CMPCLR??)
	ADD	t,b,l			; l = t + b[0]
	ADD,DC	%ret1,%r0,%ret1		; c+= carry
	STD	l,0(r_ptr)		; r[0] = l

bn_add_words_exit
	.EXIT
	BVE	(%rp)
	EXTRD,U	%ret1,31,32,%ret0	; for 32-bit, return in ret0/ret1
	.PROCEND	;in=23,24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; r[i] = a[i] - b[i] - borrow for n 64-bit words.  The borrow is kept
; in %ret1: it becomes (a[i] < b[i]) when the words differ and is left
; unchanged when they are equal.  %ret0 receives the upper 32 bits of
; %ret1.  Returns 0 when n <= 0.  Leaf routine: no frame, only
; caller-save registers are used.

t1	.reg	%r22
t2	.reg	%r21
sub_tmp1	.reg	%r20
sub_tmp2	.reg	%r19


bn_sub_words
	.proc
	.callinfo
	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	CMPIB,>=	0,n,bn_sub_words_exit	; if (n <= 0) then exit
	COPY	%r0,%ret1		; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_sub_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_sub_words_unroll2
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret1,sub_tmp1	; t3 = t3- c;

	CMPCLR,*>>	t1,t2,sub_tmp2	; sub_tmp2 = 0; skip next if t1 > t2
	LDO	1(%r0),sub_tmp2		; sub_tmp2 = 1 (t1 <= t2)

	CMPCLR,*=	t1,t2,%r0	; keep old borrow if t1 == t2
	COPY	sub_tmp2,%ret1		; c = (t1 < t2)
	STD	sub_tmp1,0(r_ptr)	; r[0] = t3

	LDD	8(a_ptr),t1
	LDD	8(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret1,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>>	t1,t2,sub_tmp2	; sub_tmp2 = 0; skip next if t1 > t2
	LDO	1(%r0),sub_tmp2		; sub_tmp2 = 1 (t1 <= t2)

	CMPCLR,*=	t1,t2,%r0	; keep old borrow if t1 == t2
	COPY	sub_tmp2,%ret1		; c = (t1 < t2)
	STD	sub_tmp1,8(r_ptr)	; r[1] = t3

	LDO	-2(n),n			; n -= 2
	LDO	16(a_ptr),a_ptr		; a += 2
	LDO	16(b_ptr),b_ptr		; b += 2

	CMPIB,<=	2,n,bn_sub_words_unroll2	; go again if more to do
	LDO	16(r_ptr),r_ptr		; r += 2 (delay slot)

	CMPIB,=,N	0,n,bn_sub_words_exit	; are we done?

bn_sub_words_single_top
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret1,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>>	t1,t2,sub_tmp2	; sub_tmp2 = 0; skip next if t1 > t2
	LDO	1(%r0),sub_tmp2		; sub_tmp2 = 1 (t1 <= t2)

	CMPCLR,*=	t1,t2,%r0	; keep old borrow if t1 == t2
	COPY	sub_tmp2,%ret1		; c = (t1 < t2)

	STD	sub_tmp1,0(r_ptr)	; r[0] = t3

bn_sub_words_exit
	.EXIT
	BVE	(%rp)
	EXTRD,U	%ret1,31,32,%ret0	; for 32-bit, return in ret0/ret1
	.PROCEND	;in=23,24,25,26,29;out=28;

;------------------------------------------------------------------------------
;
; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
;
; arg0 = h
; arg1 = l
; arg2 = d
;
; NOTE(review): "arg0/arg1/arg2" here count 64-bit parameters, not
; 32-bit argument registers.  The code assembles one 64-bit value from
; %r25:%r26, another from %r23:%r24, and loads a third from the stack
; (presumably h, l and d respectively) - confirm against the caller.
;
; This is mainly just output from the HP C compiler.
;
; Compiler-generated body; the ";offset" notes are the compiler's own
; code offsets and are kept for reference.
;
; What the visible code does:
;   - saves rp at -20(sp), then allocates a 192-byte frame while
;     storing r3 (STW,MA); r4-r9 are saved inside that frame, with
;     r5/r6 and r7/r8 each packed into one doubleword.
;   - r3 = 64-bit value loaded from -248(sp), i.e. -56 relative to sp
;     on entry (presumably d - see the prototype above).
;   - r4 = %r25:%r26 and r5 = %r23:%r24 (presumably h and l).
;   - if r3 == 0 the routine returns all ones.
;   - calls BN_num_bits_word on r3; label $D2 calls abort().
;   - uses the $$div2U millicode routine and XMPYU for the
;     quotient-digit estimate and the multiply-back.
;   - the result is returned in %r29 with its upper 32 bits copied
;     into %r28.
;
;------------------------------------------------------------------------------
bn_div_words
	.PROC
	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
	.IMPORT	BN_num_bits_word,CODE
	;--- not PIC	.IMPORT	__iob,DATA
	;--- not PIC	.IMPORT	fprintf,CODE
	.IMPORT	abort,CODE
	.IMPORT	$$div2U,MILLICODE
	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
	.ENTRY
	STW	%r2,-20(%r30)	;offset 0x8ec  (save rp)
	STW,MA	%r3,192(%r30)	;offset 0x8f0  (save r3, sp += 192)
	STW	%r4,-188(%r30)	;offset 0x8f4
	DEPD	%r5,31,32,%r6	;offset 0x8f8  (pack r5 into upper half of r6)
	STD	%r6,-184(%r30)	;offset 0x8fc
	DEPD	%r7,31,32,%r8	;offset 0x900  (pack r7 into upper half of r8)
	STD	%r8,-176(%r30)	;offset 0x904
	STW	%r9,-168(%r30)	;offset 0x908
	LDD	-248(%r30),%r3	;offset 0x90c  (stack argument, -56 at entry)
	COPY	%r26,%r4	;offset 0x910
	COPY	%r24,%r5	;offset 0x914
	DEPD	%r25,31,32,%r4	;offset 0x918  (r4 = %r25:%r26)
	CMPB,*<>	%r3,%r0,$0006000C	;offset 0x91c  (non-zero r3: main path)
	DEPD	%r23,31,32,%r5	;offset 0x920  (r5 = %r23:%r24, delay slot)
	MOVIB,TR	-1,%r29,$00060002	;offset 0x924  (r3 == 0: return all ones)
	EXTRD,U	%r29,31,32,%r28	;offset 0x928
$0006002A
	LDO	-1(%r29),%r29	;offset 0x92c
	SUB	%r23,%r7,%r23	;offset 0x930
$00060024
	SUB	%r4,%r31,%r25	;offset 0x934
	AND	%r25,%r19,%r26	;offset 0x938
	CMPB,*<>,N	%r0,%r26,$00060046	;offset 0x93c
	DEPD,Z	%r25,31,32,%r20	;offset 0x940
	OR	%r20,%r24,%r21	;offset 0x944
	CMPB,*<<,N	%r21,%r23,$0006002A	;offset 0x948
	SUB	%r31,%r2,%r31	;offset 0x94c
$00060046
$0006002E
	DEPD,Z	%r23,31,32,%r25	;offset 0x950
	EXTRD,U	%r23,31,32,%r26	;offset 0x954
	AND	%r25,%r19,%r24	;offset 0x958
	ADD,L	%r31,%r26,%r31	;offset 0x95c
	CMPCLR,*>>=	%r5,%r24,%r0	;offset 0x960
	LDO	1(%r31),%r31	;offset 0x964
$00060032
	CMPB,*<<=,N	%r31,%r4,$00060036	;offset 0x968
	LDO	-1(%r29),%r29	;offset 0x96c
	ADD,L	%r4,%r3,%r4	;offset 0x970
$00060036
	ADDIB,=,N	-1,%r8,$D0	;offset 0x974
	SUB	%r5,%r24,%r28	;offset 0x978
$0006003A
	SUB	%r4,%r31,%r24	;offset 0x97c
	SHRPD	%r24,%r28,32,%r4	;offset 0x980
	DEPD,Z	%r29,31,32,%r9	;offset 0x984
	DEPD,Z	%r28,31,32,%r5	;offset 0x988
$0006001C
	EXTRD,U	%r4,31,32,%r31	;offset 0x98c
	CMPB,*<>,N	%r31,%r2,$00060020	;offset 0x990
	MOVB,TR	%r6,%r29,$D1	;offset 0x994
	STD	%r29,-152(%r30)	;offset 0x998
$0006000C
	EXTRD,U	%r3,31,32,%r25	;offset 0x99c
	COPY	%r3,%r26	;offset 0x9a0
	EXTRD,U	%r3,31,32,%r9	;offset 0x9a4
	EXTRD,U	%r4,31,32,%r8	;offset 0x9a8
	.CALL	ARGW0=GR,ARGW1=GR,RTNVAL=GR	;in=25,26;out=28;
	B,L	BN_num_bits_word,%r2	;offset 0x9ac
	EXTRD,U	%r5,31,32,%r7	;offset 0x9b0
	LDI	64,%r20	;offset 0x9b4
	DEPD	%r7,31,32,%r5	;offset 0x9b8
	DEPD	%r8,31,32,%r4	;offset 0x9bc
	DEPD	%r9,31,32,%r3	;offset 0x9c0
	CMPB,=	%r28,%r20,$00060012	;offset 0x9c4
	COPY	%r28,%r24	;offset 0x9c8
	MTSARCM	%r24	;offset 0x9cc
	DEPDI,Z	-1,%sar,1,%r19	;offset 0x9d0
	CMPB,*>>,N	%r4,%r19,$D2	;offset 0x9d4
$00060012
	SUBI	64,%r24,%r31	;offset 0x9d8
	CMPCLR,*<<	%r4,%r3,%r0	;offset 0x9dc
	SUB	%r4,%r3,%r4	;offset 0x9e0
$00060016
	CMPB,=	%r31,%r0,$0006001A	;offset 0x9e4
	COPY	%r0,%r9	;offset 0x9e8
	MTSARCM	%r31	;offset 0x9ec
	DEPD,Z	%r3,%sar,64,%r3	;offset 0x9f0
	SUBI	64,%r31,%r26	;offset 0x9f4
	MTSAR	%r26	;offset 0x9f8
	SHRPD	%r4,%r5,%sar,%r4	;offset 0x9fc
	MTSARCM	%r31	;offset 0xa00
	DEPD,Z	%r5,%sar,64,%r5	;offset 0xa04
$0006001A
	DEPDI,Z	-1,31,32,%r19	;offset 0xa08
	AND	%r3,%r19,%r29	;offset 0xa0c
	EXTRD,U	%r29,31,32,%r2	;offset 0xa10
	DEPDI,Z	-1,63,32,%r6	;offset 0xa14
	MOVIB,TR	2,%r8,$0006001C	;offset 0xa18
	EXTRD,U	%r3,63,32,%r7	;offset 0xa1c
$D2
	;--- not PIC	ADDIL	LR'__iob-$global$,%r27,%r1	;offset 0xa20
	;--- not PIC	LDIL	LR'C$7,%r21	;offset 0xa24
	;--- not PIC	LDO	RR'__iob-$global$+32(%r1),%r26	;offset 0xa28
	;--- not PIC	.CALL	ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR	;in=24,25,26;out=28;
	;--- not PIC	B,L	fprintf,%r2	;offset 0xa2c
	;--- not PIC	LDO	RR'C$7(%r21),%r25	;offset 0xa30
	.CALL		;
	B,L	abort,%r2	;offset 0xa34
	NOP		;offset 0xa38
	B	$D3	;offset 0xa3c
	LDW	-212(%r30),%r2	;offset 0xa40
$00060020
	COPY	%r4,%r26	;offset 0xa44
	EXTRD,U	%r4,31,32,%r25	;offset 0xa48
	COPY	%r2,%r24	;offset 0xa4c
	.CALL	;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
	B,L	$$div2U,%r31	;offset 0xa50
	EXTRD,U	%r2,31,32,%r23	;offset 0xa54
	DEPD	%r28,31,32,%r29	;offset 0xa58
$00060022
	STD	%r29,-152(%r30)	;offset 0xa5c
$D1
	AND	%r5,%r19,%r24	;offset 0xa60
	EXTRD,U	%r24,31,32,%r24	;offset 0xa64
	STW	%r2,-160(%r30)	;offset 0xa68
	STW	%r7,-128(%r30)	;offset 0xa6c
	FLDD	-152(%r30),%fr4	;offset 0xa70
	FLDD	-152(%r30),%fr7	;offset 0xa74
	FLDW	-160(%r30),%fr8L	;offset 0xa78
	FLDW	-128(%r30),%fr5L	;offset 0xa7c
	XMPYU	%fr8L,%fr7L,%fr10	;offset 0xa80
	FSTD	%fr10,-136(%r30)	;offset 0xa84
	XMPYU	%fr8L,%fr7R,%fr22	;offset 0xa88
	FSTD	%fr22,-144(%r30)	;offset 0xa8c
	XMPYU	%fr5L,%fr4L,%fr11	;offset 0xa90
	XMPYU	%fr5L,%fr4R,%fr23	;offset 0xa94
	FSTD	%fr11,-112(%r30)	;offset 0xa98
	FSTD	%fr23,-120(%r30)	;offset 0xa9c
	LDD	-136(%r30),%r28	;offset 0xaa0
	DEPD,Z	%r28,31,32,%r31	;offset 0xaa4
	LDD	-144(%r30),%r20	;offset 0xaa8
	ADD,L	%r20,%r31,%r31	;offset 0xaac
	LDD	-112(%r30),%r22	;offset 0xab0
	DEPD,Z	%r22,31,32,%r22	;offset 0xab4
	LDD	-120(%r30),%r21	;offset 0xab8
	B	$00060024	;offset 0xabc
	ADD,L	%r21,%r22,%r23	;offset 0xac0
$D0
	OR	%r9,%r29,%r29	;offset 0xac4
$00060040
	EXTRD,U	%r29,31,32,%r28	;offset 0xac8  (upper half of result)
$00060002
$L2
	LDW	-212(%r30),%r2	;offset 0xacc  (reload rp: -20-192)
$D3
	LDW	-168(%r30),%r9	;offset 0xad0
	LDD	-176(%r30),%r8	;offset 0xad4
	EXTRD,U	%r8,31,32,%r7	;offset 0xad8  (unpack r7)
	LDD	-184(%r30),%r6	;offset 0xadc
	EXTRD,U	%r6,31,32,%r5	;offset 0xae0  (unpack r5)
	LDW	-188(%r30),%r4	;offset 0xae4
	BVE	(%r2)	;offset 0xae8
	.EXIT
	LDW,MB	-192(%r30),%r3	;offset 0xaec  (restore r3, pop frame)
	.PROCEND
;in=23,25;out=28,29;fpin=105,107;




;----------------------------------------------------------------------------
;
; Registers to hold 64-bit values to manipulate.  The "L" part
; of the register corresponds to the upper 32-bits, while the "R"
; part corresponds to the lower 32-bits
;
; Note, that when using b6 and b7, the code must save these before
; using them because they are callee save registers
;
;
; Floating point registers to use to save values that
; are manipulated.  These don't collide with ftemp1-6 and
; are all caller save registers
;
a0	.reg	%fr22
a0L	.reg	%fr22L
a0R	.reg	%fr22R

a1	.reg	%fr23
a1L	.reg	%fr23L
a1R	.reg	%fr23R

a2	.reg	%fr24
a2L	.reg	%fr24L
a2R	.reg	%fr24R

a3	.reg	%fr25
a3L	.reg	%fr25L
a3R	.reg	%fr25R

a4	.reg	%fr26
a4L	.reg	%fr26L
a4R	.reg	%fr26R

a5	.reg	%fr27
a5L	.reg	%fr27L
a5R	.reg	%fr27R

a6	.reg	%fr28
a6L	.reg	%fr28L
a6R	.reg	%fr28R

a7	.reg	%fr29
a7L	.reg	%fr29L
a7R	.reg	%fr29R

b0	.reg	%fr30
b0L	.reg	%fr30L
b0R	.reg	%fr30R

b1	.reg	%fr31
b1L	.reg	%fr31L
b1R	.reg	%fr31R

;
; Temporary floating point variables, these are all caller save
; registers
;
ftemp1	.reg	%fr4
ftemp2	.reg	%fr5
ftemp3	.reg	%fr6
ftemp4	.reg	%fr7

;
; The B set of registers when used.
;

b2	.reg	%fr8
b2L	.reg	%fr8L
b2R	.reg	%fr8R

b3	.reg	%fr9
b3L	.reg	%fr9L
b3R	.reg	%fr9R

b4	.reg	%fr10
b4L	.reg	%fr10L
b4R	.reg	%fr10R

b5	.reg	%fr11
b5L	.reg	%fr11L
b5R	.reg	%fr11R

b6	.reg	%fr12
b6L	.reg	%fr12L
b6R	.reg	%fr12R

b7	.reg	%fr13
b7L	.reg	%fr13L
b7R	.reg	%fr13R

;
; Integer registers used by the comba macros and routines.
; r3-r6 (c3, m, lt, ht) are callee save and must be saved by any
; routine that expands the macros; the rest are caller save.
;
c1	.reg	%r21	; only reg
temp1	.reg	%r20	; only reg
temp2	.reg	%r19	; only reg
temp3	.reg	%r31	; only reg

m1	.reg	%r28
c2	.reg	%r23
high_one	.reg	%r1
ht	.reg	%r6
lt	.reg	%r5
m	.reg	%r4
c3	.reg	%r3

;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the 128-bit square of the 64-bit value A0L:A0R into the
; three-word accumulator (C3,C2,C1).
;
; Requires: high_mask = 0xffffffff80000000, stack already bumped so
;           that -24/-16/-8(sp) are free scratch slots.
; Clobbers: ftemp1-ftemp3, m, lt, ht, temp1-temp3.
;
SQR_ADD_C	.macro	A0L,A0R,C1,C2,C3
	XMPYU	A0L,A0R,ftemp1		; m
	FSTD	ftemp1,-24(%sp)		; store m

	XMPYU	A0R,A0R,ftemp2		; lt
	FSTD	ftemp2,-16(%sp)		; store lt

	XMPYU	A0L,A0L,ftemp3		; ht
	FSTD	ftemp3,-8(%sp)		; store ht

	LDD	-24(%sp),m		; load m
	AND	m,high_mask,temp2	; m & Mask
	DEPD,Z	m,30,31,temp3		; m << 32+1
	LDD	-16(%sp),lt		; lt

	LDD	-8(%sp),ht		; ht
	EXTRD,U	temp2,32,33,temp1	; temp1 = m&Mask >> 32-1
	ADD	temp3,lt,lt		; lt = lt+m
	ADD,L	ht,temp1,ht		; ht += temp1 (ADD,L leaves the carry alone)
	ADD,DC	ht,%r0,ht		; ht += carry from lt+m

	ADD	C1,lt,C1		; c1=c1+lt
	ADD,DC	ht,%r0,ht		; ht += carry

	ADD	C2,ht,C2		; c2=c2+ht
	ADD,DC	C3,%r0,C3		; c3 += carry
.endm

;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the 128-bit product (A0L:A0R) * (A1L:A1R) into the
; three-word accumulator (C3,C2,C1).
;
; Requires: high_one = 1 << 32, stack already bumped so that
;           -32..-8(sp) are free scratch slots.
; Clobbers: ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;
SQR_ADD_C2	.macro	A0L,A0R,A1L,A1R,C1,C2,C3
	XMPYU	A0L,A1R,ftemp1		; m1 = A0L*A1R
	FSTD	ftemp1,-16(%sp)		;
	XMPYU	A0R,A1L,ftemp2		; m = A0R*A1L
	FSTD	ftemp2,-8(%sp)		;
	XMPYU	A0R,A1R,ftemp3		; lt = A0R*A1R
	FSTD	ftemp3,-32(%sp)
	XMPYU	A0L,A1L,ftemp4		; ht = A0L*A1L
	FSTD	ftemp4,-24(%sp)		;

	LDD	-8(%sp),m		; m
	LDD	-16(%sp),m1		; m1
	ADD,L	m,m1,m			; m+m1

	DEPD,Z	m,31,32,temp3		; (m+m1<<32)
	LDD	-24(%sp),ht		; ht

	CMPCLR,*>>=	m,m1,%r0	; if (m < m1), i.e. m+m1 wrapped
	ADD,L	ht,high_one,ht		; ht+=high_one

	EXTRD,U	m,31,32,temp1		; m >> 32
	LDD	-32(%sp),lt		; lt
	ADD,L	ht,temp1,ht		; ht+= m>>32
	ADD	lt,temp3,lt		; lt = lt+m1
	ADD,DC	ht,%r0,ht		; ht += carry

	ADD	ht,ht,ht		; ht=ht+ht;
	ADD,DC	C3,%r0,C3		; add in carry (c3++)

	ADD	lt,lt,lt		; lt=lt+lt;
	ADD,DC	ht,%r0,ht		; add in carry (ht++)

	ADD	C1,lt,C1		; c1=c1+lt
	ADD,DC,*NUV	ht,%r0,ht	; add in carry (ht++)
	LDO	1(C3),C3		; bump c3 if overflow,nullify otherwise

	ADD	C2,ht,C2		; c2 = c2 + ht
	ADD,DC	C3,%r0,C3		; add in carry (c3++)
.endm

;
;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
; arg0 = r_ptr
; arg1 = a_ptr
;
; Squares the eight 64-bit words a[0..7] and stores the sixteen result
; words r[0..15].  Result words are produced one column at a time with
; the accumulator roles rotating through c1, c2 and c3.
; Saves/restores r3-r6.
;

bn_sqr_comba8
	.PROC
	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.ENTRY
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp		; bump stack
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L
	DEPDI,Z	1,31,1,high_one		; Create Value  1 << 32

	;
	; Load up all of the values we are going to use
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3
	FLDD	32(a_ptr),a4
	FLDD	40(a_ptr),a5
	FLDD	48(a_ptr),a6
	FLDD	56(a_ptr),a7

	SQR_ADD_C	a0L,a0R,c1,c2,c3
	STD	c1,0(r_ptr)		; r[0] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a1L,a1R,a0L,a0R,c2,c3,c1
	STD	c2,8(r_ptr)		; r[1] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a1L,a1R,c3,c1,c2
	SQR_ADD_C2	a2L,a2R,a0L,a0R,c3,c1,c2
	STD	c3,16(r_ptr)		; r[2] = c3;
	COPY	%r0,c3

	SQR_ADD_C2	a3L,a3R,a0L,a0R,c1,c2,c3
	SQR_ADD_C2	a2L,a2R,a1L,a1R,c1,c2,c3
	STD	c1,24(r_ptr)		; r[3] = c1;
	COPY	%r0,c1

	SQR_ADD_C	a2L,a2R,c2,c3,c1
	SQR_ADD_C2	a3L,a3R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2	a4L,a4R,a0L,a0R,c2,c3,c1
	STD	c2,32(r_ptr)		; r[4] = c2;
	COPY	%r0,c2

	SQR_ADD_C2	a5L,a5R,a0L,a0R,c3,c1,c2
	SQR_ADD_C2	a4L,a4R,a1L,a1R,c3,c1,c2
	SQR_ADD_C2	a3L,a3R,a2L,a2R,c3,c1,c2
	STD	c3,40(r_ptr)		; r[5] = c3;
	COPY	%r0,c3

	SQR_ADD_C	a3L,a3R,c1,c2,c3
	SQR_ADD_C2	a4L,a4R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2	a5L,a5R,a1L,a1R,c1,c2,c3
	SQR_ADD_C2	a6L,a6R,a0L,a0R,c1,c2,c3
	STD	c1,48(r_ptr)		; r[6] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a7L,a7R,a0L,a0R,c2,c3,c1
	SQR_ADD_C2	a6L,a6R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2	a5L,a5R,a2L,a2R,c2,c3,c1
	SQR_ADD_C2	a4L,a4R,a3L,a3R,c2,c3,c1
	STD	c2,56(r_ptr)		; r[7] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a4L,a4R,c3,c1,c2
	SQR_ADD_C2	a5L,a5R,a3L,a3R,c3,c1,c2
	SQR_ADD_C2	a6L,a6R,a2L,a2R,c3,c1,c2
	SQR_ADD_C2	a7L,a7R,a1L,a1R,c3,c1,c2
	STD	c3,64(r_ptr)		; r[8] = c3;
	COPY	%r0,c3

	SQR_ADD_C2	a7L,a7R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2	a6L,a6R,a3L,a3R,c1,c2,c3
	SQR_ADD_C2	a5L,a5R,a4L,a4R,c1,c2,c3
	STD	c1,72(r_ptr)		; r[9] = c1;
	COPY	%r0,c1

	SQR_ADD_C	a5L,a5R,c2,c3,c1
	SQR_ADD_C2	a6L,a6R,a4L,a4R,c2,c3,c1
	SQR_ADD_C2	a7L,a7R,a3L,a3R,c2,c3,c1
	STD	c2,80(r_ptr)		; r[10] = c2;
	COPY	%r0,c2

	SQR_ADD_C2	a7L,a7R,a4L,a4R,c3,c1,c2
	SQR_ADD_C2	a6L,a6R,a5L,a5R,c3,c1,c2
	STD	c3,88(r_ptr)		; r[11] = c3;
	COPY	%r0,c3

	SQR_ADD_C	a6L,a6R,c1,c2,c3
	SQR_ADD_C2	a7L,a7R,a5L,a5R,c1,c2,c3
	STD	c1,96(r_ptr)		; r[12] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a7L,a7R,a6L,a6R,c2,c3,c1
	STD	c2,104(r_ptr)		; r[13] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a7L,a7R,c3,c1,c2
	STD	c3, 112(r_ptr)		; r[14] = c3
	STD	c1, 120(r_ptr)		; r[15] = c1

	.EXIT
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 and pop the frame (delay slot)

	.PROCEND

;-----------------------------------------------------------------------------
;
;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
; arg0 = r_ptr
; arg1 = a_ptr
;
; Loads the four doublewords a[0..3] into a0-a3 and writes eight doublewords
; r[0..7].  Each result word r[k] (k = 0..6) is built in the three-register
; accumulator c1/c2/c3 from one SQR_ADD_C invocation on a[k/2] (even k only)
; plus one SQR_ADD_C2 invocation for every pair a[i], a[j] with i > j and
; i + j == k.  After each column the lowest accumulator register is stored
; and cleared and the roles of c1/c2/c3 rotate.  r[7] is the word left in the
; accumulator after column 6.
;
; Only a[0..3] are read: the operand is four words long, so nothing beyond
; 24(a_ptr) may be touched (a load past the end of the array could fault if
; the operand ends at a page boundary).
;
; r3-r6 are saved on entry and restored on exit; the frame is 128 bytes.
;

bn_sqr_comba4
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64
        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack; saved regs now at -128..-104(%sp)
        DEPDI,Z -1,32,33,high_mask      ; Create Mask 0xffffffff80000000L
        DEPDI,Z 1,31,1,high_one         ; Create Value  1 << 32

        ;
        ; Load up the four input words (and no more than four)
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3

        ; column 0
        SQR_ADD_C a0L,a0R,c1,c2,c3

        STD     c1,0(r_ptr)             ; r[0] = c1;
        COPY    %r0,c1

        ; column 1
        SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1

        STD     c2,8(r_ptr)             ; r[1] = c2;
        COPY    %r0,c2

        ; column 2
        SQR_ADD_C a1L,a1R,c3,c1,c2
        SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2

        STD     c3,16(r_ptr)            ; r[2] = c3;
        COPY    %r0,c3

        ; column 3
        SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
        SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3

        STD     c1,24(r_ptr)            ; r[3] = c1;
        COPY    %r0,c1

        ; column 4
        SQR_ADD_C a2L,a2R,c2,c3,c1
        SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1

        STD     c2,32(r_ptr)            ; r[4] = c2;
        COPY    %r0,c2

        ; column 5
        SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
        STD     c3,40(r_ptr)            ; r[5] = c3;
        COPY    %r0,c3

        ; column 6, plus the word left above it
        SQR_ADD_C a3L,a3R,c1,c2,c3
        STD     c1,48(r_ptr)            ; r[6] = c1;
        STD     c2,56(r_ptr)            ; r[7] = c2;

        .EXIT
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)                   ; return to caller
        LDD,MB  -128(%sp),%r3           ; delay slot: pop the 128-byte frame, restore r3

        .PROCEND


;---------------------------------------------------------------------------
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Multiplies the 64-bit value held as the halves A0L/A0R by the one held as
; B0L/B0R and adds the 128-bit product into the accumulator C1 (low word),
; C2 (next word), C3 (carry out of C2).  The L*L product is used as the high
; word and the R*R product as the low word, so the L halves are the upper
; 32 bits of each operand.
;
; The four XMPYU partial products are spilled to the scratch doublewords at
; -32(%sp)..-8(%sp) and reloaded into general registers.
; Expects high_one to hold 1 << 32.
; Overwrites ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the four scratch
; doublewords just below %sp.
;

MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
        XMPYU   A0L,B0R,ftemp1          ; m1 = A0L*B0R (cross product)
        FSTD    ftemp1,-16(%sp)         ;
        XMPYU   A0R,B0L,ftemp2          ; m  = A0R*B0L (cross product)
        FSTD    ftemp2,-8(%sp)          ;
        XMPYU   A0R,B0R,ftemp3          ; lt = A0R*B0R (low product)
        FSTD    ftemp3,-32(%sp)
        XMPYU   A0L,B0L,ftemp4          ; ht = A0L*B0L (high product)
        FSTD    ftemp4,-24(%sp)         ;

        LDD     -8(%sp),m               ; reload m
        LDD     -16(%sp),m1             ; reload m1
        ADD,L   m,m1,m                  ; m = m + m1 (may wrap; checked below)

        DEPD,Z  m,31,32,temp3           ; temp3 = (m+m1) << 32
        LDD     -24(%sp),ht             ; reload ht

        CMPCLR,*>>= m,m1,%r0            ; if (m < m1) the sum wrapped:
        ADD,L   ht,high_one,ht          ;   ht += high_one (skipped otherwise)

        EXTRD,U m,31,32,temp1           ; temp1 = m >> 32
        LDD     -32(%sp),lt             ; reload lt
        ADD,L   ht,temp1,ht             ; ht += m >> 32
        ADD     lt,temp3,lt             ; lt += (m << 32)
        ADD,DC  ht,%r0,ht               ; carry from lt into ht

        ADD     C1,lt,C1                ; c1 = c1 + lt
        ADD,DC  ht,%r0,ht               ; add in carry (ht++)

        ADD     C2,ht,C2                ; c2 = c2 + ht
        ADD,DC  C3,%r0,C3               ; add in carry (c3++)
.endm


;
;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Loads a[0..7] into a0-a7 and b[0..7] into b0-b7 and writes sixteen
; doublewords r[0..15].  For each result word r[k] (k = 0..14) MUL_ADD_C is
; run on every pair a[i], b[j] with i + j == k; the lowest accumulator
; register is then stored and cleared and the roles of c1/c2/c3 rotate.
; r[15] is the word left in the accumulator after column 14.
;
; r3-r6 and fr12-fr13 are saved on entry and restored on exit; the frame is
; 128 bytes.
;

bn_mul_comba8
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64

        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6
        FSTD    %fr12,32(%sp)           ; save fr12
        FSTD    %fr13,40(%sp)           ; save fr13

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack; saved regs now at -128..-88(%sp)
        DEPDI,Z 1,31,1,high_one         ; Create Value  1 << 32

        ;
        ; Load up all of the values we are going to use
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3
        FLDD    32(a_ptr),a4
        FLDD    40(a_ptr),a5
        FLDD    48(a_ptr),a6
        FLDD    56(a_ptr),a7

        FLDD    0(b_ptr),b0
        FLDD    8(b_ptr),b1
        FLDD    16(b_ptr),b2
        FLDD    24(b_ptr),b3
        FLDD    32(b_ptr),b4
        FLDD    40(b_ptr),b5
        FLDD    48(b_ptr),b6
        FLDD    56(b_ptr),b7

        ; column 0 -> r[0]
        MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
        STD     c1,0(r_ptr)
        COPY    %r0,c1

        ; column 1 -> r[1]
        MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
        STD     c2,8(r_ptr)
        COPY    %r0,c2

        ; column 2 -> r[2]
        MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
        STD     c3,16(r_ptr)
        COPY    %r0,c3

        ; column 3 -> r[3]
        MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
        STD     c1,24(r_ptr)
        COPY    %r0,c1

        ; column 4 -> r[4]
        MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
        STD     c2,32(r_ptr)
        COPY    %r0,c2

        ; column 5 -> r[5]
        MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
        MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
        MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
        STD     c3,40(r_ptr)
        COPY    %r0,c3

        ; column 6 -> r[6]
        MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
        STD     c1,48(r_ptr)
        COPY    %r0,c1

        ; column 7 -> r[7]
        MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
        MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
        STD     c2,56(r_ptr)
        COPY    %r0,c2

        ; column 8 -> r[8]
        MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
        STD     c3,64(r_ptr)
        COPY    %r0,c3

        ; column 9 -> r[9]
        MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
        MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
        MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
        STD     c1,72(r_ptr)
        COPY    %r0,c1

        ; column 10 -> r[10]
        MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
        MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
        MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
        STD     c2,80(r_ptr)
        COPY    %r0,c2

        ; column 11 -> r[11]
        MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
        MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
        STD     c3,88(r_ptr)
        COPY    %r0,c3

        ; column 12 -> r[12]
        MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
        STD     c1,96(r_ptr)
        COPY    %r0,c1

        ; column 13 -> r[13]
        MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
        MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
        STD     c2,104(r_ptr)
        COPY    %r0,c2

        ; column 14 -> r[14], remaining word -> r[15]
        MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
        STD     c3,112(r_ptr)
        STD     c1,120(r_ptr)

        .EXIT
        FLDD    -88(%sp),%fr13          ; restore fr13
        FLDD    -96(%sp),%fr12          ; restore fr12
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)                   ; return to caller
        LDD,MB  -128(%sp),%r3           ; delay slot: pop the 128-byte frame, restore r3

        .PROCEND

;-----------------------------------------------------------------------------
;
;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Loads a[0..3] into a0-a3 and b[0..3] into b0-b3 and writes eight
; doublewords r[0..7].  For each result word r[k] (k = 0..6) MUL_ADD_C is run
; on every pair a[i], b[j] with i + j == k; the lowest accumulator register
; is then stored and cleared and the roles of c1/c2/c3 rotate.  r[7] is the
; word left in the accumulator after column 6.
;
; r3-r6 and fr12-fr13 are saved on entry and restored on exit; the frame is
; 128 bytes.
;

bn_mul_comba4
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64

        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6
        FSTD    %fr12,32(%sp)           ; save fr12
        FSTD    %fr13,40(%sp)           ; save fr13

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack; saved regs now at -128..-88(%sp)
        DEPDI,Z 1,31,1,high_one         ; Create Value  1 << 32

        ;
        ; Load up all of the values we are going to use
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3

        FLDD    0(b_ptr),b0
        FLDD    8(b_ptr),b1
        FLDD    16(b_ptr),b2
        FLDD    24(b_ptr),b3

        ; column 0 -> r[0]
        MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
        STD     c1,0(r_ptr)
        COPY    %r0,c1

        ; column 1 -> r[1]
        MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
        STD     c2,8(r_ptr)
        COPY    %r0,c2

        ; column 2 -> r[2]
        MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
        STD     c3,16(r_ptr)
        COPY    %r0,c3

        ; column 3 -> r[3]
        MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
        STD     c1,24(r_ptr)
        COPY    %r0,c1

        ; column 4 -> r[4]
        MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
        STD     c2,32(r_ptr)
        COPY    %r0,c2

        ; column 5 -> r[5]
        MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
        STD     c3,40(r_ptr)
        COPY    %r0,c3

        ; column 6 -> r[6], remaining word -> r[7]
        MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
        STD     c1,48(r_ptr)
        STD     c2,56(r_ptr)

        .EXIT
        FLDD    -88(%sp),%fr13          ; restore fr13
        FLDD    -96(%sp),%fr12          ; restore fr12
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)                   ; return to caller
        LDD,MB  -128(%sp),%r3           ; delay slot: pop the 128-byte frame, restore r3

        .PROCEND


;--- not PIC    .SPACE $TEXT$
;--- not PIC    .SUBSPA $CODE$
;--- not PIC    .SPACE $PRIVATE$,SORT=16
;--- not PIC    .IMPORT $global$,DATA
;--- not PIC    .SPACE $TEXT$
;--- not PIC    .SUBSPA $CODE$
;--- not PIC    .SUBSPA $LIT$,ACCESS=0x2c
;--- not PIC    C$7
;--- not PIC    .ALIGN 8
;--- not PIC    .STRINGZ "Division would overflow (%d)\n"
        .END