Home | History | Annotate | Download | only in asm
      1 ;
      2 ; PA-RISC 2.0 implementation of bn_asm code, based on the
      3 ; 64-bit version of the code.  This code is effectively the
      4 ; same as the 64-bit version except the register model is
      5 ; slightly different given all values must be 32-bit between
      6 ; function calls.  Thus the 64-bit return values are returned
      7 ; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
      8 ;
      9 ;
     10 ; This code is approximately 2x faster than the C version
     11 ; for RSA/DSA.
     12 ;
     13 ; See http://devresource.hp.com/  for more details on the PA-RISC
     14 ; architecture.  Also see the book "PA-RISC 2.0 Architecture"
     15 ; by Gerry Kane for information on the instruction set architecture.
     16 ;
     17 ; Code written by Chris Ruemmler (with some help from the HP C
     18 ; compiler).
     19 ;
     20 ; The code compiles with HP's assembler
     21 ;
     22 
     23 	.level	2.0N
     24 	.space	$TEXT$
     25 	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
     26 
     27 ;
     28 ; Global Register definitions used for the routines.
     29 ;
     30 ; Some information about HP's runtime architecture for 32-bits.
     31 ;
     32 ; "Caller save" means the calling function must save the register
     33 ; if it wants the register to be preserved.
     34 ; "Callee save" means if a function uses the register, it must save
     35 ; the value before using it.
     36 ;
     37 ; For the floating point registers
     38 ;
     39 ;    "caller save" registers: fr4-fr11, fr22-fr31
     40 ;    "callee save" registers: fr12-fr21
     41 ;    "special" registers: fr0-fr3 (status and exception registers)
     42 ;
     43 ; For the integer registers
     44 ;     value zero             :  r0
     45 ;     "caller save" registers: r1,r19-r26
     46 ;     "callee save" registers: r3-r18
     47 ;     return register        :  r2  (rp)
     48 ;     return values          ; r28,r29  (ret0,ret1)
     49 ;     Stack pointer          ; r30  (sp)
     50 ;     millicode return ptr   ; r31  (also a caller save register)
     51 
     52 
     53 ;
     54 ; Arguments to the routines
     55 ;
      56 r_ptr       .reg %r26            ; result pointer (stores go through it)
      57 a_ptr       .reg %r25            ; first source pointer
      58 b_ptr       .reg %r24            ; second source pointer (bn_add_words, bn_sub_words)
      59 num         .reg %r24            ; word count for the mul/sqr routines; same register as b_ptr
      60 n           .reg %r23            ; word count for the routines that also take b_ptr
     61 
     62 ;
     63 ; Note that the "w" argument for bn_mul_add_words and bn_mul_words
     64 ; is passed on the stack at a delta of -56 from the top of stack
     65 ; as the routine is entered.
     66 ;
     67 
     68 ;
     69 ; Globals used in some routines
     70 ;
     71 
      72 top_overflow .reg %r23    ; set to 1 << 32 by the mul routines; same register as n
      73 high_mask    .reg %r22    ; value 0xffffffff80000000L (built in bn_sqr_words)
     74 
     75 
     76 ;------------------------------------------------------------------------------
     77 ;
     78 ; bn_mul_add_words
     79 ;
     80 ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
     81 ;								int num, BN_ULONG w)
     82 ;
     83 ; arg0 = r_ptr
     84 ; arg1 = a_ptr
      85 ; arg2 = num
     86 ; -56(sp) =  w
     87 ;
     88 ; Local register definitions
     89 ;
     90 
         ; Floating point registers receiving the 32x32 XMPYU partial products.
         ; The _1 variants belong to the second word of the unrolled loops.
      91 fm1          .reg %fr22          ; ht * fw_l
      92 fm           .reg %fr23          ; lt * fw_h
      93 ht_temp      .reg %fr24          ; ht * fw_h
      94 ht_temp_1    .reg %fr25
      95 lt_temp      .reg %fr26          ; lt * fw_l
      96 lt_temp_1    .reg %fr27
      97 fm1_1        .reg %fr28
      98 fm_1         .reg %fr29
      99 
         ; w, addressable as a whole (fw) or as left/right 32-bit halves.
     100 fw_h         .reg %fr7L
     101 fw_l         .reg %fr7R
     102 fw           .reg %fr7
     103 
         ; First source word: left half is ht, right half is lt.
     104 fht_0        .reg %fr8L
     105 flt_0        .reg %fr8R
     106 t_float_0    .reg %fr8
     107 
         ; Second source word of the unrolled loops.
     108 fht_1        .reg %fr9L
     109 flt_1        .reg %fr9R
     110 t_float_1    .reg %fr9
     111 
         ; Integer temporaries. r1 and r19-r31 are caller-save; r3-r9 are
         ; callee-save and are stored by the prologue of each routine using them.
     112 tmp_0        .reg %r31
     113 tmp_1        .reg %r21
     114 m_0          .reg %r20
     115 m_1          .reg %r19
     116 ht_0         .reg %r1
     117 ht_1         .reg %r3
     118 lt_0         .reg %r4
     119 lt_1         .reg %r5
     120 m1_0         .reg %r6
     121 m1_1         .reg %r7
     122 rp_val       .reg %r8
     123 rp_val_1     .reg %r9
     124 
    124 
     125 bn_mul_add_words
         ;
         ; For each of the num words: rp[i] = low 64 bits of
         ; (ap[i]*w + rp[i] + carry); the high 64 bits become the next carry.
         ; The final carry is left in %ret1 and its upper 32 bits are also
         ; copied into %ret0 at exit.
         ;
         ; In:    r_ptr (%r26), a_ptr (%r25), num (%r24), w at -56(%sp) on entry.
         ; Stack: r3-r9 are stored at 0..48(%sp), then %sp is bumped by 128;
         ;        -8(%sp)..-64(%sp) is scratch used to move the XMPYU products
         ;        from the floating point registers to the integer registers.
         ; Each 64x64 multiply is built from four 32x32 XMPYU partial products
         ; (ht*w_h, ht*w_l, lt*w_h, lt*w_l) that are recombined with carries.
         ;
     126 	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
     127 	.proc
     128 	.callinfo frame=128
     129     .entry
     130 	.align 64
     131 
     132     STD     %r3,0(%sp)          ; save r3 (callee-save, used as ht_1)
     133     STD     %r4,8(%sp)          ; save r4 (lt_0)
     134 	NOP                         ; Needed to make the loop 16-byte aligned
     135 	NOP                         ; needed to make the loop 16-byte aligned
     136 
     137     STD     %r5,16(%sp)         ; save r5 (lt_1)
     138 	NOP
     139     STD     %r6,24(%sp)         ; save r6 (m1_0)
     140     STD     %r7,32(%sp)         ; save r7 (m1_1)
     141 
     142     STD     %r8,40(%sp)         ; save r8 (rp_val)
     143     STD     %r9,48(%sp)         ; save r9 (rp_val_1)
     144     COPY    %r0,%ret1           ; carry = 0; also the default return value
     145     DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
     146 
     147     CMPIB,>= 0,num,bn_mul_add_words_exit  ; if (num <= 0) then exit
     148 	LDO     128(%sp),%sp        ; bump stack (delay slot, runs on both paths)
     149 
     150 	;
     151 	; The loop is unrolled twice, so if there is only 1 number
     152     ; then go straight to the cleanup code.
     153 	;
     154 	CMPIB,= 1,num,bn_mul_add_words_single_top
     155 	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l); delay slot
     156 
     157 	;
     158 	; This loop is unrolled 2 times (64-byte aligned as well)
     159 	;
     160 	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
     161     ; two 32-bit multiplies can be issued per cycle.
     162     ;
     163 bn_mul_add_words_unroll2
     164 
     165     FLDD    0(a_ptr),t_float_0       ; load up 64-bit value (fr8) ht(L)/lt(R)
     166     FLDD    8(a_ptr),t_float_1       ; load up 64-bit value (fr9) ht(L)/lt(R)
     167     LDD     0(r_ptr),rp_val          ; rp[0]
     168     LDD     8(r_ptr),rp_val_1        ; rp[1]
     169 
     170     XMPYU   fht_0,fw_l,fm1           ; m1[0] = fht_0*fw_l
     171     XMPYU   fht_1,fw_l,fm1_1         ; m1[1] = fht_1*fw_l
     172     FSTD    fm1,-16(%sp)             ; -16(sp) = m1[0]
     173     FSTD    fm1_1,-48(%sp)           ; -48(sp) = m1[1]
     174 
     175     XMPYU   flt_0,fw_h,fm            ; m[0] = flt_0*fw_h
     176     XMPYU   flt_1,fw_h,fm_1          ; m[1] = flt_1*fw_h
     177     FSTD    fm,-8(%sp)               ; -8(sp) = m[0]
     178     FSTD    fm_1,-40(%sp)            ; -40(sp) = m[1]
     179 
     180     XMPYU   fht_0,fw_h,ht_temp       ; ht_temp   = fht_0*fw_h
     181     XMPYU   fht_1,fw_h,ht_temp_1     ; ht_temp_1 = fht_1*fw_h
     182     FSTD    ht_temp,-24(%sp)         ; -24(sp)   = ht_temp
     183     FSTD    ht_temp_1,-56(%sp)       ; -56(sp)   = ht_temp_1
     184 
     185     XMPYU   flt_0,fw_l,lt_temp       ; lt_temp   = flt_0*fw_l
     186     XMPYU   flt_1,fw_l,lt_temp_1     ; lt_temp_1 = flt_1*fw_l
     187     FSTD    lt_temp,-32(%sp)         ; -32(sp) = lt_temp
     188     FSTD    lt_temp_1,-64(%sp)       ; -64(sp) = lt_temp_1
     189 
     190     LDD     -8(%sp),m_0              ; m[0]
     191     LDD     -40(%sp),m_1             ; m[1]
     192     LDD     -16(%sp),m1_0            ; m1[0]
     193     LDD     -48(%sp),m1_1            ; m1[1]
     194 
     195     LDD     -24(%sp),ht_0            ; ht[0]
     196     LDD     -56(%sp),ht_1            ; ht[1]
     197     ADD,L   m1_0,m_0,tmp_0           ; tmp_0 = m[0] + m1[0];
     198     ADD,L   m1_1,m_1,tmp_1           ; tmp_1 = m[1] + m1[1];
     199 
     200     LDD     -32(%sp),lt_0
     201     LDD     -64(%sp),lt_1
     202     CMPCLR,*>>= tmp_0,m1_0, %r0      ; skip next unless the sum wrapped (tmp_0 < m1[0])
     203     ADD,L   ht_0,top_overflow,ht_0   ; ht[0] += (1<<32)
     204 
     205     CMPCLR,*>>= tmp_1,m1_1,%r0       ; skip next unless the sum wrapped (tmp_1 < m1[1])
     206     ADD,L   ht_1,top_overflow,ht_1   ; ht[1] += (1<<32)
     207     EXTRD,U tmp_0,31,32,m_0          ; m[0]>>32
     208     DEPD,Z  tmp_0,31,32,m1_0         ; m1[0] = m[0]<<32
     209 
     210     EXTRD,U tmp_1,31,32,m_1          ; m[1]>>32
     211     DEPD,Z  tmp_1,31,32,m1_1         ; m1[1] = m[1]<<32
     212     ADD,L   ht_0,m_0,ht_0            ; ht[0]+= (m[0]>>32)
     213     ADD,L   ht_1,m_1,ht_1            ; ht[1]+= (m[1]>>32)
     214 
     215     ADD     lt_0,m1_0,lt_0           ; lt[0] = lt[0]+m1[0];
     216 	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry out of the add above
     217     ADD     lt_1,m1_1,lt_1           ; lt[1] = lt[1]+m1[1];
     218     ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry
     219 
     220     ADD    %ret1,lt_0,lt_0           ; lt[0] = lt[0] + c;
     221 	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
     222     ADD     lt_0,rp_val,lt_0         ; lt[0] = lt[0]+rp[0]
     223     ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
     224 
     225 	LDO    -2(num),num               ; num = num - 2;
     226     ADD     ht_0,lt_1,lt_1           ; lt[1] = lt[1] + ht_0 (c);
     227     ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry
     228     STD     lt_0,0(r_ptr)            ; rp[0] = lt[0]
     229 
     230     ADD     lt_1,rp_val_1,lt_1       ; lt[1] = lt[1]+rp[1]
     231     ADD,DC  ht_1,%r0,%ret1           ; c = ht[1] + carry (carry into the next word)
     232     LDO     16(a_ptr),a_ptr          ; a_ptr += 2
     233 
     234     STD     lt_1,8(r_ptr)            ; rp[1] = lt[1]
     235 	CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
     236     LDO     16(r_ptr),r_ptr          ; r_ptr += 2 (delay slot)
     237 
     238     CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
     239 
     240 	;
     241 	; Cleanup for the last (or only) word.
     242 	; NOTE(review): the earlier comment called this label 64-byte aligned,
     243 	; but no .align directive is visible at this point - confirm.
     244 	;
     245 bn_mul_add_words_single_top
     246     FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
     247     LDD     0(r_ptr),rp_val           ; rp[0]
     248     LDO     8(a_ptr),a_ptr            ; a_ptr++
     249     XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
     250     FSTD    fm1,-16(%sp)              ; -16(sp) = m1
     251     XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
     252     FSTD    fm,-8(%sp)                ; -8(sp) = m
     253     XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
     254     FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
     255     XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
     256     FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
     257 
     258     LDD     -8(%sp),m_0
     259     LDD    -16(%sp),m1_0              ; m1 = temp1
     260     ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
     261     LDD     -24(%sp),ht_0
     262     LDD     -32(%sp),lt_0
     263 
     264     CMPCLR,*>>= tmp_0,m1_0,%r0        ; skip next unless the sum wrapped (tmp_0 < m1)
     265     ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
     266 
     267     EXTRD,U tmp_0,31,32,m_0           ; m>>32
     268     DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
     269 
     270     ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
     271     ADD     lt_0,m1_0,tmp_0           ; tmp_0 = lt+m1;
     272     ADD,DC  ht_0,%r0,ht_0             ; ht += carry
     273     ADD     %ret1,tmp_0,lt_0          ; lt = lt + c;
     274     ADD,DC  ht_0,%r0,ht_0             ; ht += carry
     275     ADD     lt_0,rp_val,lt_0          ; lt = lt+rp[0]
     276     ADD,DC  ht_0,%r0,%ret1            ; c = ht + carry (final carry)
     277     STD     lt_0,0(r_ptr)             ; rp[0] = lt
     278 
     279 bn_mul_add_words_exit
     280     .EXIT
     281 
     282     EXTRD,U %ret1,31,32,%ret0         ; for 32-bit, return in ret0/ret1
     283     LDD     -80(%sp),%r9              ; restore r9
     284     LDD     -88(%sp),%r8              ; restore r8
     285     LDD     -96(%sp),%r7              ; restore r7
     286     LDD     -104(%sp),%r6             ; restore r6
     287     LDD     -112(%sp),%r5             ; restore r5
     288     LDD     -120(%sp),%r4             ; restore r4
     289     BVE     (%rp)
     290     LDD,MB  -128(%sp),%r3             ; restore r3 and drop the 128-byte bump (delay slot)
     291 	.PROCEND	;in=23,24,25,26,29;out=28;
    290 
    291 ;----------------------------------------------------------------------------
    292 ;
    293 ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
    294 ;
    295 ; arg0 = rp
    296 ; arg1 = ap
     297 ; arg2 = num
    298 ; w on stack at -56(sp)
    299 
     300 bn_mul_words
         ;
         ; For each of the num words: rp[i] = low 64 bits of (ap[i]*w + carry);
         ; the high 64 bits become the next carry.  The final carry is left in
         ; %ret1 and its upper 32 bits are also copied into %ret0 at exit.
         ;
         ; In:    r_ptr (%r26), a_ptr (%r25), num (%r24), w at -56(%sp) on entry.
         ; Stack: r3-r7 are stored at 0..32(%sp), then %sp is bumped by 128;
         ;        -8(%sp)..-64(%sp) is scratch used to move the XMPYU products
         ;        from the floating point registers to the integer registers.
         ;
     301 	.proc
     302 	.callinfo frame=128
     303     .entry
     304 	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
     305 	.align 64
     306 
     307     STD     %r3,0(%sp)          ; save r3
     308     STD     %r4,8(%sp)          ; save r4
     309 	NOP
     310     STD     %r5,16(%sp)         ; save r5
     311 
     312     STD     %r6,24(%sp)         ; save r6
     313     STD     %r7,32(%sp)         ; save r7
     314     COPY    %r0,%ret1           ; carry = 0; also the default return value
     315     DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
     316 
     317     CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
     318 	LDO     128(%sp),%sp    ; bump stack (delay slot, runs on both paths)
     319 
     320 	;
     321 	; See if only 1 word to do, thus just do cleanup
     322 	;
     323 	CMPIB,= 1,num,bn_mul_words_single_top
     324 	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l); delay slot
     325 
     326 	;
     327 	; This loop is unrolled 2 times (64-byte aligned as well)
     328 	;
     329 	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
     330     ; two 32-bit multiplies can be issued per cycle.
     331     ;
     332 bn_mul_words_unroll2
     333 
     334     FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
     335     FLDD    8(a_ptr),t_float_1        ; load up 64-bit value (fr9) ht(L)/lt(R)
     336     XMPYU   fht_0,fw_l,fm1            ; m1[0] = fht_0*fw_l
     337     XMPYU   fht_1,fw_l,fm1_1          ; m1[1] = fht_1*fw_l
     338 
     339     FSTD    fm1,-16(%sp)              ; -16(sp) = m1[0]
     340     FSTD    fm1_1,-48(%sp)            ; -48(sp) = m1[1]
     341     XMPYU   flt_0,fw_h,fm             ; m[0] = flt_0*fw_h
     342     XMPYU   flt_1,fw_h,fm_1           ; m[1] = flt_1*fw_h
     343 
     344     FSTD    fm,-8(%sp)                ; -8(sp) = m[0]
     345     FSTD    fm_1,-40(%sp)             ; -40(sp) = m[1]
     346     XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = fht_0*fw_h
     347     XMPYU   fht_1,fw_h,ht_temp_1      ; ht_temp_1 = fht_1*fw_h
     348 
     349     FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht[0]
     350     FSTD    ht_temp_1,-56(%sp)        ; -56(sp) = ht[1]
     351     XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = flt_0*fw_l
     352     XMPYU   flt_1,fw_l,lt_temp_1      ; lt_temp_1 = flt_1*fw_l
     353 
     354     FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt[0]
     355     FSTD    lt_temp_1,-64(%sp)        ; -64(sp) = lt[1]
     356     LDD     -8(%sp),m_0
     357     LDD     -40(%sp),m_1
     358 
     359     LDD    -16(%sp),m1_0
     360     LDD    -48(%sp),m1_1
     361     LDD     -24(%sp),ht_0
     362     LDD     -56(%sp),ht_1
     363 
     364     ADD,L   m1_0,m_0,tmp_0            ; tmp_0 = m[0] + m1[0];
     365     ADD,L   m1_1,m_1,tmp_1            ; tmp_1 = m[1] + m1[1];
     366     LDD     -32(%sp),lt_0
     367     LDD     -64(%sp),lt_1
     368 
     369     CMPCLR,*>>= tmp_0,m1_0, %r0       ; skip next unless the sum wrapped (tmp_0 < m1[0])
     370     ADD,L   ht_0,top_overflow,ht_0    ; ht[0] += (1<<32)
     371     CMPCLR,*>>= tmp_1,m1_1,%r0        ; skip next unless the sum wrapped (tmp_1 < m1[1])
     372     ADD,L   ht_1,top_overflow,ht_1    ; ht[1] += (1<<32)
     373 
     374     EXTRD,U tmp_0,31,32,m_0           ; m[0]>>32
     375     DEPD,Z  tmp_0,31,32,m1_0          ; m1[0] = m[0]<<32
     376     EXTRD,U tmp_1,31,32,m_1           ; m[1]>>32
     377     DEPD,Z  tmp_1,31,32,m1_1          ; m1[1] = m[1]<<32
     378 
     379     ADD,L   ht_0,m_0,ht_0             ; ht[0] += (m[0]>>32)
     380     ADD,L   ht_1,m_1,ht_1             ; ht[1] += (m[1]>>32)
     381     ADD     lt_0,m1_0,lt_0            ; lt[0] = lt[0]+m1[0];
     382 	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry out of the add above
     383 
     384     ADD     lt_1,m1_1,lt_1            ; lt[1] = lt[1]+m1[1];
     385     ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
     386     ADD    %ret1,lt_0,lt_0            ; lt[0] = lt[0] + c (ret1);
     387 	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
     388 
     389     ADD     ht_0,lt_1,lt_1            ; lt[1] = lt[1] + c (ht_0)
     390     ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
     391     STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
     392     STD     lt_1,8(r_ptr)             ; rp[1] = lt[1]
     393 
     394 	COPY    ht_1,%ret1                ; carry = ht[1]
     395 	LDO    -2(num),num                ; num = num - 2;
     396     LDO     16(a_ptr),a_ptr           ; ap += 2
     397 	CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain
     398     LDO     16(r_ptr),r_ptr           ; rp += 2 (delay slot)
     399 
     400     CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
     401 
     402 	;
     403 	; Cleanup for the last (or only) word.
     404 	; NOTE(review): the earlier comment called this label 64-byte aligned,
     405 	; but no .align directive is visible at this point - confirm.
     406 	;
     407 bn_mul_words_single_top
     408     FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
     409 
     410     XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
     411     FSTD    fm1,-16(%sp)              ; -16(sp) = m1
     412     XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
     413     FSTD    fm,-8(%sp)                ; -8(sp) = m
     414     XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
     415     FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
     416     XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
     417     FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
     418 
     419     LDD     -8(%sp),m_0
     420     LDD    -16(%sp),m1_0
     421     ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
     422     LDD     -24(%sp),ht_0
     423     LDD     -32(%sp),lt_0
     424 
     425     CMPCLR,*>>= tmp_0,m1_0,%r0        ; skip next unless the sum wrapped (tmp_0 < m1)
     426     ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
     427 
     428     EXTRD,U tmp_0,31,32,m_0           ; m>>32
     429     DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
     430 
     431     ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
     432     ADD     lt_0,m1_0,lt_0            ; lt= lt+m1;
     433     ADD,DC  ht_0,%r0,ht_0             ; ht += carry
     434 
     435     ADD     %ret1,lt_0,lt_0           ; lt = lt + c;
     436     ADD,DC  ht_0,%r0,ht_0             ; ht += carry
     437 
     438     COPY    ht_0,%ret1                ; copy carry
     439     STD     lt_0,0(r_ptr)             ; rp[0] = lt
     440 
     441 bn_mul_words_exit
     442     .EXIT
     443     EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
     444     LDD     -96(%sp),%r7              ; restore r7
     445     LDD     -104(%sp),%r6             ; restore r6
     446     LDD     -112(%sp),%r5             ; restore r5
     447     LDD     -120(%sp),%r4             ; restore r4
     448     BVE     (%rp)
     449     LDD,MB  -128(%sp),%r3             ; restore r3 and drop the 128-byte bump (delay slot)
     450 	.PROCEND
    449 
    450 ;----------------------------------------------------------------------------
    451 ;
    452 ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
    453 ;
    454 ; arg0 = rp
    455 ; arg1 = ap
    456 ; arg2 = num
    457 ;
    458 
     459 bn_sqr_words
         ;
         ; For each of the num source words, stores the 128-bit square of ap[i]
         ; as two result words: low half at rp[2*i], high half at rp[2*i+1].
         ;
         ; In:    r_ptr (%r26), a_ptr (%r25), num (%r24).  Nothing is written
         ;        to the return registers.
         ; Stack: r3-r5 are stored at 0..16(%sp), then %sp is bumped by 128;
         ;        -8(%sp)..-56(%sp) is scratch for moving XMPYU products to
         ;        the integer registers.
         ; With a = ht*2^32 + lt, the square is ht*ht*2^64 + 2*m*2^32 + lt*lt
         ; where m = ht*lt.  The doubled middle term is added as (m << 33)
         ; into the low word and (m >> 31) into the high word.
         ;
     460 	.proc
     461 	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
     462 	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
     463     .entry
     464 	.align 64
     465 
     466     STD     %r3,0(%sp)          ; save r3
     467     STD     %r4,8(%sp)          ; save r4
     468 	NOP
     469     STD     %r5,16(%sp)         ; save r5
     470 
     471     CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
     472 	LDO     128(%sp),%sp       ; bump stack (delay slot, runs on both paths)
     473 
     474 	;
     475 	; If only 1, then go straight to cleanup
     476 	;
     477 	CMPIB,= 1,num,bn_sqr_words_single_top
     478     DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L (delay slot)
     479 
     480 	;
     481 	; This loop is unrolled 2 times (64-byte aligned as well)
     482 	;
     483 
     484 bn_sqr_words_unroll2
     485     FLDD    0(a_ptr),t_float_0        ; a[0]
     486     FLDD    8(a_ptr),t_float_1        ; a[1]
     487     XMPYU   fht_0,flt_0,fm            ; m[0] = ht*lt
     488     XMPYU   fht_1,flt_1,fm_1          ; m[1] = ht*lt
     489 
     490     FSTD    fm,-24(%sp)               ; store m[0]
     491     FSTD    fm_1,-56(%sp)             ; store m[1]
     492     XMPYU   flt_0,flt_0,lt_temp       ; lt[0] = lt*lt
     493     XMPYU   flt_1,flt_1,lt_temp_1     ; lt[1] = lt*lt
     494 
     495     FSTD    lt_temp,-16(%sp)          ; store lt[0]
     496     FSTD    lt_temp_1,-48(%sp)        ; store lt[1]
     497     XMPYU   fht_0,fht_0,ht_temp       ; ht[0] = ht*ht
     498     XMPYU   fht_1,fht_1,ht_temp_1     ; ht[1] = ht*ht
     499 
     500     FSTD    ht_temp,-8(%sp)           ; store ht[0]
     501     FSTD    ht_temp_1,-40(%sp)        ; store ht[1]
     502     LDD     -24(%sp),m_0
     503     LDD     -56(%sp),m_1
     504 
     505     AND     m_0,high_mask,tmp_0       ; m[0] & Mask
     506     AND     m_1,high_mask,tmp_1       ; m[1] & Mask
     507     DEPD,Z  m_0,30,31,m_0             ; m[0] << 32+1
     508     DEPD,Z  m_1,30,31,m_1             ; m[1] << 32+1
     509 
     510     LDD     -16(%sp),lt_0
     511     LDD     -48(%sp),lt_1
     512     EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m[0]&Mask >> 32-1
     513     EXTRD,U tmp_1,32,33,tmp_1         ; tmp_1 = m[1]&Mask >> 32-1
     514 
     515     LDD     -8(%sp),ht_0
     516     LDD     -40(%sp),ht_1
     517     ADD,L   ht_0,tmp_0,ht_0           ; ht[0] += tmp_0
     518     ADD,L   ht_1,tmp_1,ht_1           ; ht[1] += tmp_1
     519 
     520     ADD     lt_0,m_0,lt_0             ; lt[0] = lt[0]+m[0]
     521     ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
     522     STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
     523     STD     ht_0,8(r_ptr)             ; rp[1] = ht[0]
     524 
     525     ADD     lt_1,m_1,lt_1             ; lt[1] = lt[1]+m[1]
     526     ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
     527     STD     lt_1,16(r_ptr)            ; rp[2] = lt[1]
     528     STD     ht_1,24(r_ptr)            ; rp[3] = ht[1]
     529 
     530 	LDO    -2(num),num                ; num = num - 2;
     531     LDO     16(a_ptr),a_ptr           ; ap += 2
     532 	CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain
     533     LDO     32(r_ptr),r_ptr           ; rp += 4 (delay slot)
     534 
     535     CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
     536 
     537 	;
     538 	; Cleanup for the last (or only) word.
     539 	; NOTE(review): the earlier comment called this label 64-byte aligned,
     540 	; but no .align directive is visible at this point - confirm.
     541 	;
     542 bn_sqr_words_single_top
     543     FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
     544 
     545     XMPYU   fht_0,flt_0,fm            ; m = ht*lt
     546     FSTD    fm,-24(%sp)               ; store m
     547 
     548     XMPYU   flt_0,flt_0,lt_temp       ; lt = lt*lt
     549     FSTD    lt_temp,-16(%sp)          ; store lt
     550 
     551     XMPYU   fht_0,fht_0,ht_temp       ; ht = ht*ht
     552     FSTD    ht_temp,-8(%sp)           ; store ht
     553 
     554     LDD     -24(%sp),m_0              ; load m
     555     AND     m_0,high_mask,tmp_0       ; m & Mask
     556     DEPD,Z  m_0,30,31,m_0             ; m << 32+1
     557     LDD     -16(%sp),lt_0             ; lt
     558 
     559     LDD     -8(%sp),ht_0              ; ht
     560     EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m&Mask >> 32-1
     561     ADD     m_0,lt_0,lt_0             ; lt = lt+m
     562     ADD,L   ht_0,tmp_0,ht_0           ; ht += tmp_0
     563     ADD,DC  ht_0,%r0,ht_0             ; ht += carry from lt = lt+m
     564 
     565     STD     lt_0,0(r_ptr)             ; rp[0] = lt
     566     STD     ht_0,8(r_ptr)             ; rp[1] = ht
     567 
     568 bn_sqr_words_exit
     569     .EXIT
     570     LDD     -112(%sp),%r5       ; restore r5
     571     LDD     -120(%sp),%r4       ; restore r4
     572     BVE     (%rp)
     573     LDD,MB  -128(%sp),%r3       ; restore r3 and drop the 128-byte bump (delay slot)
     574 	.PROCEND	;in=23,24,25,26,29;out=28;
    573 
    574 
    575 ;----------------------------------------------------------------------------
    576 ;
    577 ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
    578 ;
    579 ; arg0 = rp
    580 ; arg1 = ap
    581 ; arg2 = bp
    582 ; arg3 = n
    583 
     584 t  .reg %r22       ; a[i], then a[i] + carry-in
     585 b  .reg %r21       ; b[i]
     586 l  .reg %r20       ; sum word that is stored to r[i]
    587 
     588 bn_add_words
         ;
         ; For each of the n words: r[i] = a[i] + b[i] + c, where c is the
         ; carry out of the previous word (0 for the first word).  The final
         ; carry is left in %ret1 and its upper 32 bits are copied to %ret0.
         ;
         ; In: r_ptr (%r26), a_ptr (%r25), b_ptr (%r24), n (%r23).
         ; Only r19-r31 range registers are written, so nothing is saved and
         ; %sp is not moved.
         ;
     589 	.proc
     590     .entry
     591 	.callinfo
     592 	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
     593 	.align 64
     594 
     595     CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
     596     COPY    %r0,%ret1           ; return 0 by default (delay slot, runs on both paths)
     597 
     598 	;
     599 	; If 2 or more numbers do the loop
     600 	;
     601 	CMPIB,= 1,n,bn_add_words_single_top
     602 	NOP
     603 
     604 	;
     605 	; This loop is unrolled 2 times (64-byte aligned as well)
     606 	;
     607 bn_add_words_unroll2
     608 	LDD     0(a_ptr),t
     609 	LDD     0(b_ptr),b
     610 	ADD     t,%ret1,t                    ; t = t+c;
     611 	ADD,DC  %r0,%r0,%ret1                ; set c to carry
     612 	ADD     t,b,l                        ; l = t + b[0]
     613 	ADD,DC  %ret1,%r0,%ret1              ; c+= carry
     614 	STD     l,0(r_ptr)                   ; r[0] = l
     615 
     616 	LDD     8(a_ptr),t
     617 	LDD     8(b_ptr),b
     618 	ADD     t,%ret1,t                     ; t = t+c;
     619 	ADD,DC  %r0,%r0,%ret1                 ; set c to carry
     620 	ADD     t,b,l                         ; l = t + b[1]
     621 	ADD,DC  %ret1,%r0,%ret1               ; c+= carry
     622 	STD     l,8(r_ptr)                    ; r[1] = l
     623 
     624 	LDO     -2(n),n                       ; n = n - 2
     625 	LDO     16(a_ptr),a_ptr               ; a += 2
     626 	LDO     16(b_ptr),b_ptr               ; b += 2
     627 
     628 	CMPIB,<= 2,n,bn_add_words_unroll2     ; go again if at least 2 words remain
     629 	LDO     16(r_ptr),r_ptr               ; r += 2 (delay slot)
     630 
     631     CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
     632 
         ; Cleanup for the last (or only) word.
     633 bn_add_words_single_top
     634 	LDD     0(a_ptr),t
     635 	LDD     0(b_ptr),b
     636 
     637 	ADD     t,%ret1,t                 ; t = t+c;
     638 	ADD,DC  %r0,%r0,%ret1             ; set c to carry (could use CMPCLR??)
     639 	ADD     t,b,l                     ; l = t + b[0]
     640 	ADD,DC  %ret1,%r0,%ret1           ; c+= carry
     641 	STD     l,0(r_ptr)                ; r[0] = l
     642 
     643 bn_add_words_exit
     644     .EXIT
     645     BVE     (%rp)
     646     EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1 (delay slot)
     647 	.PROCEND	;in=23,24,25,26,29;out=28;
    648 
    649 ;----------------------------------------------------------------------------
    650 ;
    651 ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
    652 ;
    653 ; arg0 = rp
    654 ; arg1 = ap
    655 ; arg2 = bp
    656 ; arg3 = n
    657 
     658 t1       .reg %r22       ; a[i]
     659 t2       .reg %r21       ; b[i]
     660 sub_tmp1 .reg %r20       ; difference word that is stored to r[i]
     661 sub_tmp2 .reg %r19       ; candidate borrow: 0 if t1 > t2, else 1
    662 
    663 
     664 bn_sub_words
         ;
         ; For each of the n words: r[i] = a[i] - b[i] - c, where c is the
         ; borrow from the previous word (0 for the first word).  After each
         ; word the borrow is recomputed as (a[i] < b[i]) when a[i] != b[i]
         ; and is left unchanged when a[i] == b[i].  The final borrow is left
         ; in %ret1 and its upper 32 bits are copied to %ret0.
         ;
         ; In: r_ptr (%r26), a_ptr (%r25), b_ptr (%r24), n (%r23).
         ; Only r19-r31 range registers are written, so nothing is saved and
         ; %sp is not moved.
         ;
     665 	.proc
     666 	.callinfo
     667 	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
     668     .entry
     669 	.align 64
     670 
     671     CMPIB,>=  0,n,bn_sub_words_exit ; if (n <= 0) then exit
     672     COPY    %r0,%ret1           ; return 0 by default (delay slot, runs on both paths)
     673 
     674 	;
     675 	; If 2 or more numbers do the loop
     676 	;
     677 	CMPIB,= 1,n,bn_sub_words_single_top
     678 	NOP
     679 
     680 	;
     681 	; This loop is unrolled 2 times (64-byte aligned as well)
     682 	;
     683 bn_sub_words_unroll2
     684 	LDD     0(a_ptr),t1
     685 	LDD     0(b_ptr),t2
     686 	SUB     t1,t2,sub_tmp1           ; t3 = t1-t2;
     687 	SUB     sub_tmp1,%ret1,sub_tmp1  ; t3 = t3- c;
     688 
     689 	CMPCLR,*>> t1,t2,sub_tmp2        ; sub_tmp2 = 0; skip next if t1 > t2
     690 	LDO      1(%r0),sub_tmp2         ; sub_tmp2 = 1 (t1 <= t2)
     691 
     692 	CMPCLR,*= t1,t2,%r0              ; skip next if t1 == t2 (keep old c)
     693 	COPY    sub_tmp2,%ret1           ; c = (t1 < t2)
     694 	STD     sub_tmp1,0(r_ptr)        ; r[0] = t3
     695 
     696 	LDD     8(a_ptr),t1
     697 	LDD     8(b_ptr),t2
     698 	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
     699 	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
     700 	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
     701 	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
     702 
     703 	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (keep old c)
     704 	COPY    sub_tmp2,%ret1            ; c = (t1 < t2)
     705 	STD     sub_tmp1,8(r_ptr)         ; r[1] = t3
     706 
     707 	LDO     -2(n),n                   ; n = n - 2
     708 	LDO     16(a_ptr),a_ptr           ; a += 2
     709 	LDO     16(b_ptr),b_ptr           ; b += 2
     710 
     711 	CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain
     712 	LDO     16(r_ptr),r_ptr           ; r += 2 (delay slot)
     713 
     714     CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
     715 
         ; Cleanup for the last (or only) word.
     716 bn_sub_words_single_top
     717 	LDD     0(a_ptr),t1
     718 	LDD     0(b_ptr),t2
     719 	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
     720 	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
     721 	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
     722 	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
     723 
     724 	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (keep old c)
     725 	COPY    sub_tmp2,%ret1            ; c = (t1 < t2)
     726 
     727 	STD     sub_tmp1,0(r_ptr)         ; r[0] = t3
     728 
     729 bn_sub_words_exit
     730     .EXIT
     731     BVE     (%rp)
     732     EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1 (delay slot)
     733 	.PROCEND	;in=23,24,25,26,29;out=28;
    734 
    735 ;------------------------------------------------------------------------------
    736 ;
    737 ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
    738 ;
    739 ; arg0 = h
    740 ; arg1 = l
    741 ; arg2 = d
    742 ;
    743 ; This is mainly just output from the HP C compiler.
    744 ;
    745 ;------------------------------------------------------------------------------
    746 bn_div_words
    747 	.PROC
    748 	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
    749 	.IMPORT	BN_num_bits_word,CODE
    750 	;--- not PIC	.IMPORT	__iob,DATA
    751 	;--- not PIC	.IMPORT	fprintf,CODE
    752 	.IMPORT	abort,CODE
    753 	.IMPORT	$$div2U,MILLICODE
    754 	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
    755         .ENTRY
    756         STW     %r2,-20(%r30)   ;offset 0x8ec
    757         STW,MA  %r3,192(%r30)   ;offset 0x8f0
    758         STW     %r4,-188(%r30)  ;offset 0x8f4
    759         DEPD    %r5,31,32,%r6   ;offset 0x8f8
    760         STD     %r6,-184(%r30)  ;offset 0x8fc
    761         DEPD    %r7,31,32,%r8   ;offset 0x900
    762         STD     %r8,-176(%r30)  ;offset 0x904
    763         STW     %r9,-168(%r30)  ;offset 0x908
    764         LDD     -248(%r30),%r3  ;offset 0x90c
    765         COPY    %r26,%r4        ;offset 0x910
    766         COPY    %r24,%r5        ;offset 0x914
    767         DEPD    %r25,31,32,%r4  ;offset 0x918
    768         CMPB,*<>        %r3,%r0,$0006000C       ;offset 0x91c
    769         DEPD    %r23,31,32,%r5  ;offset 0x920
    770         MOVIB,TR        -1,%r29,$00060002       ;offset 0x924
    771         EXTRD,U %r29,31,32,%r28 ;offset 0x928
    772 $0006002A
    773         LDO     -1(%r29),%r29   ;offset 0x92c
    774         SUB     %r23,%r7,%r23   ;offset 0x930
    775 $00060024
    776         SUB     %r4,%r31,%r25   ;offset 0x934
    777         AND     %r25,%r19,%r26  ;offset 0x938
    778         CMPB,*<>,N      %r0,%r26,$00060046      ;offset 0x93c
    779         DEPD,Z  %r25,31,32,%r20 ;offset 0x940
    780         OR      %r20,%r24,%r21  ;offset 0x944
    781         CMPB,*<<,N      %r21,%r23,$0006002A     ;offset 0x948
    782         SUB     %r31,%r2,%r31   ;offset 0x94c
    783 $00060046
    784 $0006002E
    785         DEPD,Z  %r23,31,32,%r25 ;offset 0x950
    786         EXTRD,U %r23,31,32,%r26 ;offset 0x954
    787         AND     %r25,%r19,%r24  ;offset 0x958
    788         ADD,L   %r31,%r26,%r31  ;offset 0x95c
    789         CMPCLR,*>>=     %r5,%r24,%r0    ;offset 0x960
    790         LDO     1(%r31),%r31    ;offset 0x964
    791 $00060032
    792         CMPB,*<<=,N     %r31,%r4,$00060036      ;offset 0x968
    793         LDO     -1(%r29),%r29   ;offset 0x96c
    794         ADD,L   %r4,%r3,%r4     ;offset 0x970
    795 $00060036
    796         ADDIB,=,N       -1,%r8,$D0      ;offset 0x974
    797         SUB     %r5,%r24,%r28   ;offset 0x978
    798 $0006003A
    799         SUB     %r4,%r31,%r24   ;offset 0x97c
    800         SHRPD   %r24,%r28,32,%r4        ;offset 0x980
    801         DEPD,Z  %r29,31,32,%r9  ;offset 0x984
    802         DEPD,Z  %r28,31,32,%r5  ;offset 0x988
    803 $0006001C
    804         EXTRD,U %r4,31,32,%r31  ;offset 0x98c
    805         CMPB,*<>,N      %r31,%r2,$00060020      ;offset 0x990
    806         MOVB,TR %r6,%r29,$D1    ;offset 0x994
    807         STD     %r29,-152(%r30) ;offset 0x998
    808 $0006000C
    809         EXTRD,U %r3,31,32,%r25  ;offset 0x99c
    810         COPY    %r3,%r26        ;offset 0x9a0
    811         EXTRD,U %r3,31,32,%r9   ;offset 0x9a4
    812         EXTRD,U %r4,31,32,%r8   ;offset 0x9a8
    813         .CALL   ARGW0=GR,ARGW1=GR,RTNVAL=GR     ;in=25,26;out=28;
    814         B,L     BN_num_bits_word,%r2    ;offset 0x9ac
    815         EXTRD,U %r5,31,32,%r7   ;offset 0x9b0
    816         LDI     64,%r20 ;offset 0x9b4
    817         DEPD    %r7,31,32,%r5   ;offset 0x9b8
    818         DEPD    %r8,31,32,%r4   ;offset 0x9bc
    819         DEPD    %r9,31,32,%r3   ;offset 0x9c0
    820         CMPB,=  %r28,%r20,$00060012     ;offset 0x9c4
    821         COPY    %r28,%r24       ;offset 0x9c8
    822         MTSARCM %r24    ;offset 0x9cc
    823         DEPDI,Z -1,%sar,1,%r19  ;offset 0x9d0
    824         CMPB,*>>,N      %r4,%r19,$D2    ;offset 0x9d4
    825 $00060012
    826         SUBI    64,%r24,%r31    ;offset 0x9d8
    827         CMPCLR,*<<      %r4,%r3,%r0     ;offset 0x9dc
    828         SUB     %r4,%r3,%r4     ;offset 0x9e0
    829 $00060016
    830         CMPB,=  %r31,%r0,$0006001A      ;offset 0x9e4
    831         COPY    %r0,%r9 ;offset 0x9e8
    832         MTSARCM %r31    ;offset 0x9ec
    833         DEPD,Z  %r3,%sar,64,%r3 ;offset 0x9f0
    834         SUBI    64,%r31,%r26    ;offset 0x9f4
    835         MTSAR   %r26    ;offset 0x9f8
    836         SHRPD   %r4,%r5,%sar,%r4        ;offset 0x9fc
    837         MTSARCM %r31    ;offset 0xa00
    838         DEPD,Z  %r5,%sar,64,%r5 ;offset 0xa04
    839 $0006001A
    840         DEPDI,Z -1,31,32,%r19   ;offset 0xa08
    841         AND     %r3,%r19,%r29   ;offset 0xa0c
    842         EXTRD,U %r29,31,32,%r2  ;offset 0xa10
    843         DEPDI,Z -1,63,32,%r6    ;offset 0xa14
    844         MOVIB,TR        2,%r8,$0006001C ;offset 0xa18
    845         EXTRD,U %r3,63,32,%r7   ;offset 0xa1c
    846 $D2
    847         ;--- not PIC	ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
    848         ;--- not PIC	LDIL    LR'C$7,%r21     ;offset 0xa24
    849         ;--- not PIC	LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
    850         ;--- not PIC	.CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
    851         ;--- not PIC	B,L     fprintf,%r2     ;offset 0xa2c
    852         ;--- not PIC	LDO     RR'C$7(%r21),%r25       ;offset 0xa30
    853         .CALL           ;
    854         B,L     abort,%r2       ;offset 0xa34
    855         NOP             ;offset 0xa38
    856         B       $D3     ;offset 0xa3c
    857         LDW     -212(%r30),%r2  ;offset 0xa40
    858 $00060020
    859         COPY    %r4,%r26        ;offset 0xa44
    860         EXTRD,U %r4,31,32,%r25  ;offset 0xa48
    861         COPY    %r2,%r24        ;offset 0xa4c
    862         .CALL   ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
    863         B,L     $$div2U,%r31    ;offset 0xa50
    864         EXTRD,U %r2,31,32,%r23  ;offset 0xa54
    865         DEPD    %r28,31,32,%r29 ;offset 0xa58
    866 $00060022
    867         STD     %r29,-152(%r30) ;offset 0xa5c
    868 $D1
    869         AND     %r5,%r19,%r24   ;offset 0xa60
    870         EXTRD,U %r24,31,32,%r24 ;offset 0xa64
    871         STW     %r2,-160(%r30)  ;offset 0xa68
    872         STW     %r7,-128(%r30)  ;offset 0xa6c
    873         FLDD    -152(%r30),%fr4 ;offset 0xa70
    874         FLDD    -152(%r30),%fr7 ;offset 0xa74
    875         FLDW    -160(%r30),%fr8L        ;offset 0xa78
    876         FLDW    -128(%r30),%fr5L        ;offset 0xa7c
    877         XMPYU   %fr8L,%fr7L,%fr10       ;offset 0xa80
    878         FSTD    %fr10,-136(%r30)        ;offset 0xa84
    879         XMPYU   %fr8L,%fr7R,%fr22       ;offset 0xa88
    880         FSTD    %fr22,-144(%r30)        ;offset 0xa8c
    881         XMPYU   %fr5L,%fr4L,%fr11       ;offset 0xa90
    882         XMPYU   %fr5L,%fr4R,%fr23       ;offset 0xa94
    883         FSTD    %fr11,-112(%r30)        ;offset 0xa98
    884         FSTD    %fr23,-120(%r30)        ;offset 0xa9c
    885         LDD     -136(%r30),%r28 ;offset 0xaa0
    886         DEPD,Z  %r28,31,32,%r31 ;offset 0xaa4
    887         LDD     -144(%r30),%r20 ;offset 0xaa8
    888         ADD,L   %r20,%r31,%r31  ;offset 0xaac
    889         LDD     -112(%r30),%r22 ;offset 0xab0
    890         DEPD,Z  %r22,31,32,%r22 ;offset 0xab4
    891         LDD     -120(%r30),%r21 ;offset 0xab8
    892         B       $00060024       ;offset 0xabc
    893         ADD,L   %r21,%r22,%r23  ;offset 0xac0
    894 $D0
    895         OR      %r9,%r29,%r29   ;offset 0xac4
    896 $00060040
    897         EXTRD,U %r29,31,32,%r28 ;offset 0xac8
    898 $00060002
    899 $L2
    900         LDW     -212(%r30),%r2  ;offset 0xacc
    901 $D3
    902         LDW     -168(%r30),%r9  ;offset 0xad0
    903         LDD     -176(%r30),%r8  ;offset 0xad4
    904         EXTRD,U %r8,31,32,%r7   ;offset 0xad8
    905         LDD     -184(%r30),%r6  ;offset 0xadc
    906         EXTRD,U %r6,31,32,%r5   ;offset 0xae0
    907         LDW     -188(%r30),%r4  ;offset 0xae4
    908         BVE     (%r2)   ;offset 0xae8
    909         .EXIT
    910         LDW,MB  -192(%r30),%r3  ;offset 0xaec
    911 	.PROCEND	;in=23,25;out=28,29;fpin=105,107;
    912 
    913 
    914 
    915 
    916 ;----------------------------------------------------------------------------
    917 ;
    918 ; Registers to hold 64-bit values to manipulate.  The "L" part
    919 ; of the register corresponds to the upper 32-bits, while the "R"
    920 ; part corresponds to the lower 32-bits
    921 ;
    922 ; Note that b6 and b7 (%fr12 and %fr13) are callee save registers, so any
    923 ; routine that uses them must save them first and restore them on exit.
    924 ;
    925 ;
    926 ; Floating point registers to use to save values that
    927 ; are manipulated.  These don't collide with ftemp1-6 and
    928 ; are all caller save registers
    929 ;
    930 a0        .reg %fr22             ; a[0] in the comba routines
    931 a0L       .reg %fr22L
    932 a0R       .reg %fr22R
    933 
    934 a1        .reg %fr23             ; a[1]
    935 a1L       .reg %fr23L
    936 a1R       .reg %fr23R
    937 
    938 a2        .reg %fr24             ; a[2]
    939 a2L       .reg %fr24L
    940 a2R       .reg %fr24R
    941 
    942 a3        .reg %fr25             ; a[3]
    943 a3L       .reg %fr25L
    944 a3R       .reg %fr25R
    945 
    946 a4        .reg %fr26             ; a[4]
    947 a4L       .reg %fr26L
    948 a4R       .reg %fr26R
    949 
    950 a5        .reg %fr27             ; a[5]
    951 a5L       .reg %fr27L
    952 a5R       .reg %fr27R
    953 
    954 a6        .reg %fr28             ; a[6]
    955 a6L       .reg %fr28L
    956 a6R       .reg %fr28R
    957 
    958 a7        .reg %fr29             ; a[7]
    959 a7L       .reg %fr29L
    960 a7R       .reg %fr29R
    961 
    962 b0        .reg %fr30             ; b[0] in the bn_mul_comba* routines
    963 b0L       .reg %fr30L
    964 b0R       .reg %fr30R
    965 
    966 b1        .reg %fr31             ; b[1]
    967 b1L       .reg %fr31L
    968 b1R       .reg %fr31R
    969 
    970 ;
    971 ; Temporary floating point variables, these are all caller save
    972 ; registers.  The macros below use them as XMPYU destinations.
    973 ;
    974 ftemp1    .reg %fr4
    975 ftemp2    .reg %fr5
    976 ftemp3    .reg %fr6
    977 ftemp4    .reg %fr7
    978 
    979 ;
    980 ; The B set of registers when used (remaining words of the b operand).
    981 ;
    982 
    983 b2        .reg %fr8              ; b[2]
    984 b2L       .reg %fr8L
    985 b2R       .reg %fr8R
    986 
    987 b3        .reg %fr9              ; b[3]
    988 b3L       .reg %fr9L
    989 b3R       .reg %fr9R
    990 
    991 b4        .reg %fr10             ; b[4]
    992 b4L       .reg %fr10L
    993 b4R       .reg %fr10R
    994 
    995 b5        .reg %fr11             ; b[5]
    996 b5L       .reg %fr11L
    997 b5R       .reg %fr11R
    998 
    999 b6        .reg %fr12             ; b[6] -- fr12 is callee save, spill before use
   1000 b6L       .reg %fr12L
   1001 b6R       .reg %fr12R
   1002 
   1003 b7        .reg %fr13             ; b[7] -- fr13 is callee save, spill before use
   1004 b7L       .reg %fr13L
   1005 b7R       .reg %fr13R
   1006 
   1007 c1           .reg %r21   ; only reg; column accumulator word (roles of c1/c2/c3 rotate)
   1008 temp1        .reg %r20   ; only reg; scratch for the shifted-down part of a cross product
   1009 temp2        .reg %r19   ; only reg; scratch (masked m in SQR_ADD_C)
   1010 temp3        .reg %r31   ; only reg; scratch for the shifted-up part of a cross product
   1011 
   1012 m1           .reg %r28   ; second cross product in SQR_ADD_C2 / MUL_ADD_C
   1013 c2           .reg %r23   ; column accumulator word
   1014 high_one     .reg %r1    ; constant 1 << 32, set up by each comba routine
   1015 ht           .reg %r6    ; high partial product (r6 is callee save)
   1016 lt           .reg %r5    ; low partial product (r5 is callee save)
   1017 m            .reg %r4    ; cross product / cross-product sum (r4 is callee save)
   1018 c3           .reg %r3    ; column accumulator word (r3 is callee save)
   1019 ; SQR_ADD_C: (C3:C2:C1) += a*a for the 64-bit word a = A0L:A0R.  Needs high_mask set by the caller; clobbers m, lt, ht, temp1-temp3, ftemp1-ftemp3 and -24..-1(%sp).
   1020 SQR_ADD_C  .macro  A0L,A0R,C1,C2,C3
   1021     XMPYU   A0L,A0R,ftemp1       ; m  = aL*aR (cross product, counted twice below)
   1022     FSTD    ftemp1,-24(%sp)      ; store m so it can be reloaded into a general register
   1023 
   1024     XMPYU   A0R,A0R,ftemp2       ; lt = aR*aR
   1025     FSTD    ftemp2,-16(%sp)      ; store lt
   1026 
   1027     XMPYU   A0L,A0L,ftemp3       ; ht = aL*aL
   1028     FSTD    ftemp3,-8(%sp)       ; store ht
   1029 
   1030     LDD     -24(%sp),m           ; load m
   1031     AND     m,high_mask,temp2    ; temp2 = top 33 bits of m (m & Mask)
   1032     DEPD,Z  m,30,31,temp3        ; temp3 = m << 33 = low 64 bits of (2*m) << 32
   1033     LDD     -16(%sp),lt          ; load lt
   1034 
   1035     LDD     -8(%sp),ht           ; load ht
   1036     EXTRD,U temp2,32,33,temp1    ; temp1 = (m & Mask) >> 31 = high part of (2*m) << 32
   1037     ADD     temp3,lt,lt          ; lt += low part of doubled cross product (sets carry)
   1038     ADD,L   ht,temp1,ht          ; ht += temp1 (ADD,L keeps the carry from the line above)
   1039     ADD,DC  ht,%r0,ht            ; ht += carry out of lt
   1040 
   1041     ADD     C1,lt,C1             ; c1=c1+lt
   1042     ADD,DC  ht,%r0,ht            ; ht += carry out of c1
   1043 
   1044     ADD     C2,ht,C2             ; c2=c2+ht
   1045     ADD,DC  C3,%r0,C3            ; c3 += carry out of c2
   1046 .endm
   1047 ; SQR_ADD_C2: (C3:C2:C1) += 2 * x * y for the words x = A0L:A0R and y = A1L:A1R.  Needs high_one set by the caller; clobbers m, m1, lt, ht, temp1, temp3, ftemp1-ftemp4 and -32..-1(%sp).
   1048 SQR_ADD_C2 .macro  A0L,A0R,A1L,A1R,C1,C2,C3
   1049     XMPYU   A0L,A1R,ftemp1          ; m1 = xL*yR (first cross product)
   1050     FSTD    ftemp1,-16(%sp)         ;
   1051     XMPYU   A0R,A1L,ftemp2          ; m  = xR*yL (second cross product)
   1052     FSTD    ftemp2,-8(%sp)          ;
   1053     XMPYU   A0R,A1R,ftemp3          ; lt = xR*yR (low product)
   1054     FSTD    ftemp3,-32(%sp)
   1055     XMPYU   A0L,A1L,ftemp4          ; ht = xL*yL (high product)
   1056     FSTD    ftemp4,-24(%sp)         ;
   1057 
   1058     LDD     -8(%sp),m               ; m  = xR*yL
   1059     LDD     -16(%sp),m1             ; m1 = xL*yR
   1060     ADD,L   m,m1,m                  ; m = m+m1 (may wrap; detected below)
   1061 
   1062     DEPD,Z  m,31,32,temp3           ; temp3 = (m+m1) << 32
   1063     LDD     -24(%sp),ht             ; ht = xL*yL
   1064 
   1065     CMPCLR,*>>= m,m1,%r0            ; if (m < m1) the sum wrapped; else skip next insn
   1066     ADD,L   ht,high_one,ht          ; ht+=high_one (the lost 2^64, scaled by 2^32)
   1067 
   1068     EXTRD,U m,31,32,temp1           ; temp1 = m >> 32
   1069     LDD     -32(%sp),lt             ; lt = xR*yR
   1070     ADD,L   ht,temp1,ht             ; ht+= m>>32
   1071     ADD     lt,temp3,lt             ; lt += (m << 32), sets carry
   1072     ADD,DC  ht,%r0,ht               ; ht += carry out of lt
   1073 
   1074     ADD     ht,ht,ht                ; ht=ht+ht;
   1075     ADD,DC  C3,%r0,C3               ; add in carry (c3++)
   1076 
   1077     ADD     lt,lt,lt                ; lt=lt+lt;
   1078     ADD,DC  ht,%r0,ht               ; add in carry (ht++)
   1079 
   1080     ADD     C1,lt,C1                ; c1=c1+lt
   1081     ADD,DC,*NUV ht,%r0,ht           ; add in carry (ht++); skip next insn unless ht overflowed
   1082     LDO     1(C3),C3              ; bump c3 if overflow,nullify otherwise
   1083 
   1084     ADD     C2,ht,C2                ; c2 = c2 + ht
   1085     ADD,DC  C3,%r0,C3             ; add in carry (c3++)
   1086 .endm
   1087 
   1088 ; bn_sqr_comba8: r[0..15] = square of the 8-word number a[0..7], one result column at a time.
   1089 ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
   1090 ; arg0 = r_ptr
   1091 ; arg1 = a_ptr
   1092 ; c1/c2/c3 form a rotating three-word column accumulator; r3-r6 are saved because c3, m, lt and ht live in them.
   1093 
   1094 bn_sqr_comba8
   1095 	.PROC
   1096 	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
   1097 	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
   1098     .ENTRY
   1099 	.align 64
   1100 
   1101     STD     %r3,0(%sp)          ; save r3 (c3)
   1102     STD     %r4,8(%sp)          ; save r4 (m)
   1103     STD     %r5,16(%sp)         ; save r5 (lt)
   1104     STD     %r6,24(%sp)         ; save r6 (ht)
   1105 
   1106 	;
   1107 	; Zero out carries
   1108 	;
   1109 	COPY     %r0,c1
   1110 	COPY     %r0,c2
   1111 	COPY     %r0,c3
   1112 
   1113 	LDO      128(%sp),%sp       ; bump stack; the macros spill products just below the new %sp
   1114     DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
   1115     DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
   1116 
   1117 	;
   1118 	; Load up all of the values we are going to use
   1119 	;
   1120     FLDD     0(a_ptr),a0
   1121     FLDD     8(a_ptr),a1
   1122     FLDD    16(a_ptr),a2
   1123     FLDD    24(a_ptr),a3
   1124     FLDD    32(a_ptr),a4
   1125     FLDD    40(a_ptr),a5
   1126     FLDD    48(a_ptr),a6
   1127     FLDD    56(a_ptr),a7
   1128 
   1129 	SQR_ADD_C a0L,a0R,c1,c2,c3
   1130 	STD     c1,0(r_ptr)          ; r[0] = c1;
   1131 	COPY    %r0,c1               ; c1 becomes the top word of the next column
   1132 
   1133 	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
   1134 	STD     c2,8(r_ptr)          ; r[1] = c2;
   1135 	COPY    %r0,c2
   1136 
   1137 	SQR_ADD_C a1L,a1R,c3,c1,c2
   1138 	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
   1139 	STD     c3,16(r_ptr)            ; r[2] = c3;
   1140 	COPY    %r0,c3
   1141 
   1142 	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
   1143 	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
   1144 	STD     c1,24(r_ptr)           ; r[3] = c1;
   1145 	COPY    %r0,c1
   1146 
   1147 	SQR_ADD_C a2L,a2R,c2,c3,c1
   1148 	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
   1149 	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
   1150 	STD     c2,32(r_ptr)          ; r[4] = c2;
   1151 	COPY    %r0,c2
   1152 
   1153 	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
   1154 	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
   1155 	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
   1156 	STD     c3,40(r_ptr)          ; r[5] = c3;
   1157 	COPY    %r0,c3
   1158 
   1159 	SQR_ADD_C a3L,a3R,c1,c2,c3
   1160 	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
   1161 	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
   1162 	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
   1163 	STD     c1,48(r_ptr)          ; r[6] = c1;
   1164 	COPY    %r0,c1
   1165 
   1166 	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
   1167 	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
   1168 	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
   1169 	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
   1170 	STD     c2,56(r_ptr)          ; r[7] = c2;
   1171 	COPY    %r0,c2
   1172 
   1173 	SQR_ADD_C a4L,a4R,c3,c1,c2
   1174 	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
   1175 	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
   1176 	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
   1177 	STD     c3,64(r_ptr)          ; r[8] = c3;
   1178 	COPY    %r0,c3
   1179 
   1180 	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
   1181 	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
   1182 	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
   1183 	STD     c1,72(r_ptr)          ; r[9] = c1;
   1184 	COPY    %r0,c1
   1185 
   1186 	SQR_ADD_C a5L,a5R,c2,c3,c1
   1187 	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
   1188 	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
   1189 	STD     c2,80(r_ptr)          ; r[10] = c2;
   1190 	COPY    %r0,c2
   1191 
   1192 	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
   1193 	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
   1194 	STD     c3,88(r_ptr)          ; r[11] = c3;
   1195 	COPY    %r0,c3
   1196 
   1197 	SQR_ADD_C a6L,a6R,c1,c2,c3
   1198 	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
   1199 	STD     c1,96(r_ptr)          ; r[12] = c1;
   1200 	COPY    %r0,c1
   1201 
   1202 	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
   1203 	STD     c2,104(r_ptr)         ; r[13] = c2;
   1204 	COPY    %r0,c2
   1205 
   1206 	SQR_ADD_C a7L,a7R,c3,c1,c2
   1207 	STD     c3, 112(r_ptr)       ; r[14] = c3
   1208 	STD     c1, 120(r_ptr)       ; r[15] = c1
   1209 
   1210     .EXIT
   1211     LDD     -104(%sp),%r6        ; restore r6
   1212     LDD     -112(%sp),%r5        ; restore r5
   1213     LDD     -120(%sp),%r4        ; restore r4
   1214     BVE     (%rp)
   1215     LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
   1216 
   1217 	.PROCEND
   1218 
   1219 ;-----------------------------------------------------------------------------
   1220 ; bn_sqr_comba4: r[0..7] = square of the 4-word number a[0..3], one result column at a time.
   1221 ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
   1222 ; arg0 = r_ptr
   1223 ; arg1 = a_ptr
   1224 ; c1/c2/c3 form a rotating three-word column accumulator; r3-r6 are saved because c3, m, lt and ht live in them.
   1225 
   1226 bn_sqr_comba4
   1227 	.proc
   1228 	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
   1229 	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
   1230     .entry
   1231 	.align 64
   1232     STD     %r3,0(%sp)          ; save r3 (c3)
   1233     STD     %r4,8(%sp)          ; save r4 (m)
   1234     STD     %r5,16(%sp)         ; save r5 (lt)
   1235     STD     %r6,24(%sp)         ; save r6 (ht)
   1236 
   1237 	;
   1238 	; Zero out carries
   1239 	;
   1240 	COPY     %r0,c1
   1241 	COPY     %r0,c2
   1242 	COPY     %r0,c3
   1243 
   1244 	LDO      128(%sp),%sp       ; bump stack; the macros spill products just below the new %sp
   1245     DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
   1246     DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
   1247 
   1248 	;
   1249 	; Load up all of the values we are going to use
   1250 	;
   1251     FLDD     0(a_ptr),a0
   1252     FLDD     8(a_ptr),a1
   1253     FLDD    16(a_ptr),a2
   1254     FLDD    24(a_ptr),a3
   1255     ; a4..a7 are deliberately NOT loaded: the input has only four words,
   1256     ; so reading 32(a_ptr)..56(a_ptr) would run 32 bytes past the end of
   1257     ; the caller's array, and nothing below refers to a4..a7 anyway.
   1258 
   1259 	SQR_ADD_C a0L,a0R,c1,c2,c3
   1260 
   1261 	STD     c1,0(r_ptr)          ; r[0] = c1;
   1262 	COPY    %r0,c1               ; c1 becomes the top word of the next column
   1263 
   1264 	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
   1265 
   1266 	STD     c2,8(r_ptr)          ; r[1] = c2;
   1267 	COPY    %r0,c2
   1268 
   1269 	SQR_ADD_C a1L,a1R,c3,c1,c2
   1270 	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
   1271 
   1272 	STD     c3,16(r_ptr)            ; r[2] = c3;
   1273 	COPY    %r0,c3
   1274 
   1275 	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
   1276 	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
   1277 
   1278 	STD     c1,24(r_ptr)           ; r[3] = c1;
   1279 	COPY    %r0,c1
   1280 
   1281 	SQR_ADD_C a2L,a2R,c2,c3,c1
   1282 	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
   1283 
   1284 	STD     c2,32(r_ptr)           ; r[4] = c2;
   1285 	COPY    %r0,c2
   1286 
   1287 	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
   1288 	STD     c3,40(r_ptr)           ; r[5] = c3;
   1289 	COPY    %r0,c3
   1290 
   1291 	SQR_ADD_C a3L,a3R,c1,c2,c3
   1292 	STD     c1,48(r_ptr)           ; r[6] = c1;
   1293 	STD     c2,56(r_ptr)           ; r[7] = c2;
   1294 
   1295     .EXIT
   1296     LDD     -104(%sp),%r6        ; restore r6
   1297     LDD     -112(%sp),%r5        ; restore r5
   1298     LDD     -120(%sp),%r4        ; restore r4
   1299     BVE     (%rp)
   1300     LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
   1301 
   1302 	.PROCEND
   1303 
   1304 
   1305 
   1306 ;---------------------------------------------------------------------------
   1307 ; MUL_ADD_C: (C3:C2:C1) += x * y for the words x = A0L:A0R and y = B0L:B0R.  Needs high_one set by the caller; clobbers m, m1, lt, ht, temp1, temp3, ftemp1-ftemp4 and -32..-1(%sp).
   1308 MUL_ADD_C  .macro  A0L,A0R,B0L,B0R,C1,C2,C3
   1309     XMPYU   A0L,B0R,ftemp1        ; m1 = xL*yR (first cross product)
   1310     FSTD    ftemp1,-16(%sp)       ;
   1311     XMPYU   A0R,B0L,ftemp2        ; m  = xR*yL (second cross product)
   1312     FSTD    ftemp2,-8(%sp)        ;
   1313     XMPYU   A0R,B0R,ftemp3        ; lt = xR*yR (low product)
   1314     FSTD    ftemp3,-32(%sp)
   1315     XMPYU   A0L,B0L,ftemp4        ; ht = xL*yL (high product)
   1316     FSTD    ftemp4,-24(%sp)       ;
   1317 
   1318     LDD     -8(%sp),m             ; m  = xR*yL
   1319     LDD     -16(%sp),m1           ; m1 = xL*yR
   1320     ADD,L   m,m1,m                ; m = m+m1 (may wrap; detected below)
   1321 
   1322     DEPD,Z  m,31,32,temp3         ; temp3 = (m+m1) << 32
   1323     LDD     -24(%sp),ht           ; ht = xL*yL
   1324 
   1325     CMPCLR,*>>= m,m1,%r0          ; if (m < m1) the sum wrapped; else skip next insn
   1326     ADD,L   ht,high_one,ht        ; ht+=high_one (the lost 2^64, scaled by 2^32)
   1327 
   1328     EXTRD,U m,31,32,temp1         ; temp1 = m >> 32
   1329     LDD     -32(%sp),lt           ; lt = xR*yR
   1330     ADD,L   ht,temp1,ht           ; ht+= m>>32
   1331     ADD     lt,temp3,lt           ; lt += (m << 32), sets carry
   1332     ADD,DC  ht,%r0,ht             ; ht += carry out of lt
   1333 
   1334     ADD     C1,lt,C1              ; c1=c1+lt
   1335     ADD,DC  ht,%r0,ht             ; ht += carry out of c1
   1336 
   1337     ADD     C2,ht,C2              ; c2 = c2 + ht
   1338     ADD,DC  C3,%r0,C3             ; add in carry (c3++)
   1339 .endm
   1340 
   1341 
   1342 ; bn_mul_comba8: r[0..15] = a[0..7] * b[0..7], one result column at a time.
   1343 ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   1344 ; arg0 = r_ptr
   1345 ; arg1 = a_ptr
   1346 ; arg2 = b_ptr
   1347 ; r3-r6 and fr12/fr13 (b6/b7) are saved on entry and restored on exit because the body overwrites them.
   1348 
   1349 bn_mul_comba8
   1350 	.proc
   1351 	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
   1352 	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
   1353     .entry
   1354 	.align 64
   1355 
   1356     STD     %r3,0(%sp)          ; save r3 (c3)
   1357     STD     %r4,8(%sp)          ; save r4 (m)
   1358     STD     %r5,16(%sp)         ; save r5 (lt)
   1359     STD     %r6,24(%sp)         ; save r6 (ht)
   1360     FSTD    %fr12,32(%sp)       ; save fr12 (b6)
   1361     FSTD    %fr13,40(%sp)       ; save fr13 (b7)
   1362 
   1363 	;
   1364 	; Zero out carries
   1365 	;
   1366 	COPY     %r0,c1
   1367 	COPY     %r0,c2
   1368 	COPY     %r0,c3
   1369 
   1370 	LDO      128(%sp),%sp       ; bump stack; MUL_ADD_C spills products just below the new %sp
   1371     DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
   1372 
   1373 	;
   1374 	; Load up all of the values we are going to use
   1375 	;
   1376     FLDD      0(a_ptr),a0
   1377     FLDD      8(a_ptr),a1
   1378     FLDD     16(a_ptr),a2
   1379     FLDD     24(a_ptr),a3
   1380     FLDD     32(a_ptr),a4
   1381     FLDD     40(a_ptr),a5
   1382     FLDD     48(a_ptr),a6
   1383     FLDD     56(a_ptr),a7
   1384 
   1385     FLDD      0(b_ptr),b0
   1386     FLDD      8(b_ptr),b1
   1387     FLDD     16(b_ptr),b2
   1388     FLDD     24(b_ptr),b3
   1389     FLDD     32(b_ptr),b4
   1390     FLDD     40(b_ptr),b5
   1391     FLDD     48(b_ptr),b6
   1392     FLDD     56(b_ptr),b7
   1393 
   1394 	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
   1395 	STD       c1,0(r_ptr)         ; r[0] = c1
   1396 	COPY      %r0,c1              ; c1 becomes the top word of the next column
   1397 
   1398 	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
   1399 	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
   1400 	STD       c2,8(r_ptr)         ; r[1] = c2
   1401 	COPY      %r0,c2
   1402 
   1403 	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
   1404 	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
   1405 	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
   1406 	STD       c3,16(r_ptr)        ; r[2] = c3
   1407 	COPY      %r0,c3
   1408 
   1409 	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
   1410 	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
   1411 	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
   1412 	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
   1413 	STD       c1,24(r_ptr)        ; r[3] = c1
   1414 	COPY      %r0,c1
   1415 
   1416 	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
   1417 	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
   1418 	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
   1419 	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
   1420 	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
   1421 	STD       c2,32(r_ptr)        ; r[4] = c2
   1422 	COPY      %r0,c2
   1423 
   1424 	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
   1425 	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
   1426 	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
   1427 	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
   1428 	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
   1429 	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
   1430 	STD       c3,40(r_ptr)        ; r[5] = c3
   1431 	COPY      %r0,c3
   1432 
   1433 	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
   1434 	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
   1435 	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
   1436 	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
   1437 	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
   1438 	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
   1439 	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
   1440 	STD       c1,48(r_ptr)        ; r[6] = c1
   1441 	COPY      %r0,c1
   1442 
   1443 	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
   1444 	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
   1445 	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
   1446 	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
   1447 	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
   1448 	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
   1449 	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
   1450 	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
   1451 	STD       c2,56(r_ptr)        ; r[7] = c2
   1452 	COPY      %r0,c2
   1453 
   1454 	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
   1455 	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
   1456 	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
   1457 	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
   1458 	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
   1459 	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
   1460 	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
   1461 	STD       c3,64(r_ptr)        ; r[8] = c3
   1462 	COPY      %r0,c3
   1463 
   1464 	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
   1465 	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
   1466 	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
   1467 	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
   1468 	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
   1469 	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
   1470 	STD       c1,72(r_ptr)        ; r[9] = c1
   1471 	COPY      %r0,c1
   1472 
   1473 	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
   1474 	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
   1475 	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
   1476 	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
   1477 	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
   1478 	STD       c2,80(r_ptr)        ; r[10] = c2
   1479 	COPY      %r0,c2
   1480 
   1481 	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
   1482 	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
   1483 	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
   1484 	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
   1485 	STD       c3,88(r_ptr)        ; r[11] = c3
   1486 	COPY      %r0,c3
   1487 
   1488 	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
   1489 	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
   1490 	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
   1491 	STD       c1,96(r_ptr)        ; r[12] = c1
   1492 	COPY      %r0,c1
   1493 
   1494 	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
   1495 	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
   1496 	STD       c2,104(r_ptr)       ; r[13] = c2
   1497 	COPY      %r0,c2
   1498 
   1499 	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
   1500 	STD       c3,112(r_ptr)       ; r[14] = c3
   1501 	STD       c1,120(r_ptr)       ; r[15] = c1
   1502 
   1503     .EXIT
   1504     FLDD    -88(%sp),%fr13       ; restore fr13 (b7)
   1505     FLDD    -96(%sp),%fr12       ; restore fr12 (b6)
   1506     LDD     -104(%sp),%r6        ; restore r6
   1507     LDD     -112(%sp),%r5        ; restore r5
   1508     LDD     -120(%sp),%r4        ; restore r4
   1509     BVE     (%rp)
   1510     LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
   1511 
   1512 	.PROCEND
   1513 
   1514 ;-----------------------------------------------------------------------------
   1515 ; bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], one result column at a time.
   1516 ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   1517 ; arg0 = r_ptr
   1518 ; arg1 = a_ptr
   1519 ; arg2 = b_ptr
   1520 ; NOTE(review): fr12/fr13 are spilled and reloaded here although b6/b7 are never loaded in this routine -- confirm the spill is still needed.
   1521 
   1522 bn_mul_comba4
   1523 	.proc
   1524 	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
   1525 	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
   1526     .entry
   1527 	.align 64
   1528 
   1529     STD     %r3,0(%sp)          ; save r3 (c3)
   1530     STD     %r4,8(%sp)          ; save r4 (m)
   1531     STD     %r5,16(%sp)         ; save r5 (lt)
   1532     STD     %r6,24(%sp)         ; save r6 (ht)
   1533     FSTD    %fr12,32(%sp)       ; save fr12 (b6)
   1534     FSTD    %fr13,40(%sp)       ; save fr13 (b7)
   1535 
   1536 	;
   1537 	; Zero out carries
   1538 	;
   1539 	COPY     %r0,c1
   1540 	COPY     %r0,c2
   1541 	COPY     %r0,c3
   1542 
   1543 	LDO      128(%sp),%sp       ; bump stack; MUL_ADD_C spills products just below the new %sp
   1544     DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
   1545 
   1546 	;
   1547 	; Load up all of the values we are going to use
   1548 	;
   1549     FLDD      0(a_ptr),a0
   1550     FLDD      8(a_ptr),a1
   1551     FLDD     16(a_ptr),a2
   1552     FLDD     24(a_ptr),a3
   1553 
   1554     FLDD      0(b_ptr),b0
   1555     FLDD      8(b_ptr),b1
   1556     FLDD     16(b_ptr),b2
   1557     FLDD     24(b_ptr),b3
   1558 
   1559 	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
   1560 	STD       c1,0(r_ptr)         ; r[0] = c1
   1561 	COPY      %r0,c1              ; c1 becomes the top word of the next column
   1562 
   1563 	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
   1564 	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
   1565 	STD       c2,8(r_ptr)         ; r[1] = c2
   1566 	COPY      %r0,c2
   1567 
   1568 	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
   1569 	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
   1570 	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
   1571 	STD       c3,16(r_ptr)        ; r[2] = c3
   1572 	COPY      %r0,c3
   1573 
   1574 	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
   1575 	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
   1576 	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
   1577 	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
   1578 	STD       c1,24(r_ptr)        ; r[3] = c1
   1579 	COPY      %r0,c1
   1580 
   1581 	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
   1582 	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
   1583 	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
   1584 	STD       c2,32(r_ptr)        ; r[4] = c2
   1585 	COPY      %r0,c2
   1586 
   1587 	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
   1588 	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
   1589 	STD       c3,40(r_ptr)        ; r[5] = c3
   1590 	COPY      %r0,c3
   1591 
   1592 	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
   1593 	STD       c1,48(r_ptr)        ; r[6] = c1
   1594 	STD       c2,56(r_ptr)        ; r[7] = c2
   1595 
   1596     .EXIT
   1597     FLDD    -88(%sp),%fr13       ; restore fr13 (b7)
   1598     FLDD    -96(%sp),%fr12       ; restore fr12 (b6)
   1599     LDD     -104(%sp),%r6        ; restore r6
   1600     LDD     -112(%sp),%r5        ; restore r5
   1601     LDD     -120(%sp),%r4        ; restore r4
   1602     BVE     (%rp)
   1603     LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
   1604 
   1605 	.PROCEND
   1606 
   1607 
   1608 ;--- not PIC	.SPACE	$TEXT$
   1609 ;--- not PIC	.SUBSPA	$CODE$
   1610 ;--- not PIC	.SPACE	$PRIVATE$,SORT=16
   1611 ;--- not PIC	.IMPORT	$global$,DATA
   1612 ;--- not PIC	.SPACE	$TEXT$
   1613 ;--- not PIC	.SUBSPA	$CODE$
   1614 ;--- not PIC	.SUBSPA	$LIT$,ACCESS=0x2c
   1615 ;--- not PIC	C$7
   1616 ;--- not PIC	.ALIGN	8
   1617 ;--- not PIC	.STRINGZ	"Division would overflow (%d)\n"
   1618 	.END
   1619