Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # December 2005
     11 #
     12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
     13 # for undertaking this effort are multiple. First of all, UltraSPARC is not
     14 # the whole SPARCv9 universe and other VIS-free implementations deserve
     15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
     16 # a.k.a. Niagara, has a shared FPU and concurrent FPU-intensive paths,
     17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
     18 # several integrated RSA/DSA accelerator circuits accessible through
     19 # kernel driver [only(*)], but having decent user-land software
     20 # implementation is important too. Finally, there was the desire to
     21 # experiment with a dedicated squaring procedure. Yes, this module
     22 # implements one, because it was easiest to draft it in SPARCv9
     23 # instructions...
     24 
     25 # (*)	Engine accessing the driver in question is on my TODO list.
     26 #	For reference, the accelerator is estimated to give 6 to 10 times
     27 #	improvement on single-threaded RSA sign. It should be noted
     28 #	that 6-10x improvement coefficient does not actually mean
     29 #	something extraordinary in terms of absolute [single-threaded]
     30 #	performance, as SPARCv9 instruction set is by all means least
     31 #	suitable for high performance crypto among other 64 bit
     32 #	platforms. 6-10x factor simply places T1 in same performance
     33 #	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
     34 #	appear impressive at all, but it's the sign operation which is
     35 #	far more critical/interesting.
     36 
     37 # You might notice that inner loops are modulo-scheduled:-) This has
     38 # essentially negligible impact on UltraSPARC performance, it's
     39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
     40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
     41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
     42 # module still has hidden potential [see TODO list there], which is
     43 # estimated to be larger than 20%...
     44 
     45 # int bn_mul_mont(
     46 $rp="%i0";	# BN_ULONG *rp,
     47 $ap="%i1";	# const BN_ULONG *ap,
     48 $bp="%i2";	# const BN_ULONG *bp,
     49 $np="%i3";	# const BN_ULONG *np,
     50 $n0="%i4";	# const BN_ULONG *n0,
     51 $num="%i5";	# int num);
        # The arguments are named by their %i registers, i.e. as the generated
        # code sees them after the "save" in .Lenter; only the num check that
        # precedes that "save" reads the argument as %o5.
     52 
        # ABI selection: any command-line argument matching -m64 or -xarch=v9
        # switches to the 64-bit build. $bias is added to %sp wherever the
        # generated code addresses the stack (presumably the SPARC V9 ABI stack
        # bias -- confirm against the ABI), and $frame is the size passed to
        # "save" and the offset of the temporary vector above %sp+$bias.
     53 $bits=32;
     54 for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
     55 if ($bits==64)	{ $bias=2047; $frame=192; }
     56 else		{ $bias=0;    $frame=128; }
     57 
        # Scratch registers used by the generated loops.
     58 $car0="%o0";	# running carry of the ap[j]*mul0 chain
     59 $car1="%o1";	# running carry of the np[j]*mul1 chain
     60 $car2="%o2";	# 1 bit
     61 $acc0="%o3";	# pending term for the $car0 chain
     62 $acc1="%o4";	# pending term for the $car1 chain
     63 $mask="%g1";	# 32 bits, what a waste...
     64 $tmp0="%g4";
     65 $tmp1="%g5";
     66 
        # Loop state kept in local registers. $i and $j are byte offsets (they
        # advance by 4 and are compared against $num after it has been scaled
        # by 4 in .Lenter).
     67 $i="%l0";	# outer-loop offset
     68 $j="%l1";	# inner-loop offset
     69 $mul0="%l2";	# current multiplier word (bp[i], or ap[i] when squaring)
     70 $mul1="%l3";	# low 32 bits of n0 * current tp[0] term
     71 $tp="%l4";	# pointer into the temporary vector on the stack
     72 $apj="%l5";	# preloaded ap[j]
     73 $npj="%l6";	# preloaded np[j]
     74 $tpj="%l7";	# preloaded tp[j]
     75 
     76 $fname="bn_mul_mont_int";
     77 
        # Generated code, part 1: entry point, stack set-up and the generic
        # multiplication path. The here-document below is emitted verbatim
        # (after variable interpolation), so it is documented from out here.
        #
        # - Entry: returns 0 straight away when num < 4. Otherwise .Lenter
        #   opens a register window, turns num into a byte count (num*4),
        #   replaces $n0 with the word it points to, and lowers %sp by at
        #   least that byte count, rounding down to a 1024-byte boundary, to
        #   make room for the temporary vector tp at %sp+$bias+$frame.
        # - When ap == bp the code branches to .Lbn_sqr_mont, which is emitted
        #   by the next here-document.
        # - .L1st processes bp[0] and writes tp; .Louter/.Linner process the
        #   remaining bp[i] words, accumulating into tp. $car2 carries the
        #   topmost overflow bit from one pass to the next.
        # - .Ltail/.Lsub store tp-np into rp. The final borrow combined with
        #   $car2 gives an all-ones or all-zeros mask that selects tp or rp as
        #   the source for .Lcopy, which writes the result to rp and zeroes
        #   tp. The function then returns 1.
     78 $code=<<___;
     79 .section	".text",#alloc,#execinstr
     80 
     81 .global	$fname
     82 .align	32
     83 $fname:
     84 	cmp	%o5,4			! 128 bits minimum
     85 	bge,pt	%icc,.Lenter
     86 	sethi	%hi(0xffffffff),$mask
     87 	retl
     88 	clr	%o0
     89 .align	32
     90 .Lenter:
     91 	save	%sp,-$frame,%sp
     92 	sll	$num,2,$num		! num*=4
     93 	or	$mask,%lo(0xffffffff),$mask
     94 	ld	[$n0],$n0
     95 	cmp	$ap,$bp
     96 	and	$num,$mask,$num
     97 	ld	[$bp],$mul0		! bp[0]
     98 	nop
     99 
    100 	add	%sp,$bias,%o7		! real top of stack
    101 	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
    102 	sub	%o7,$num,%o7
    103 	ld	[$ap+4],$apj		! ap[1]
    104 	and	%o7,-1024,%o7
    105 	ld	[$np],$car1		! np[0]
    106 	sub	%o7,$bias,%sp		! alloca
    107 	ld	[$np+4],$npj		! np[1]
    108 	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
    109 	mov	12,$j
    110 
    111 	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
    112 	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
    113 	and	$car0,$mask,$acc0
    114 	add	%sp,$bias+$frame,$tp
    115 	ld	[$ap+8],$apj		!prologue!
    116 
    117 	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
    118 	and	$mul1,$mask,$mul1
    119 
    120 	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
    121 	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
    122 	srlx	$car0,32,$car0
    123 	add	$acc0,$car1,$car1
    124 	ld	[$np+8],$npj		!prologue!
    125 	srlx	$car1,32,$car1
    126 	mov	$tmp0,$acc0		!prologue!
    127 
    128 .L1st:
    129 	mulx	$apj,$mul0,$tmp0
    130 	mulx	$npj,$mul1,$tmp1
    131 	add	$acc0,$car0,$car0
    132 	ld	[$ap+$j],$apj		! ap[j]
    133 	and	$car0,$mask,$acc0
    134 	add	$acc1,$car1,$car1
    135 	ld	[$np+$j],$npj		! np[j]
    136 	srlx	$car0,32,$car0
    137 	add	$acc0,$car1,$car1
    138 	add	$j,4,$j			! j++
    139 	mov	$tmp0,$acc0
    140 	st	$car1,[$tp]
    141 	cmp	$j,$num
    142 	mov	$tmp1,$acc1
    143 	srlx	$car1,32,$car1
    144 	bl	%icc,.L1st
    145 	add	$tp,4,$tp		! tp++
    146 !.L1st
    147 
    148 	mulx	$apj,$mul0,$tmp0	!epilogue!
    149 	mulx	$npj,$mul1,$tmp1
    150 	add	$acc0,$car0,$car0
    151 	and	$car0,$mask,$acc0
    152 	add	$acc1,$car1,$car1
    153 	srlx	$car0,32,$car0
    154 	add	$acc0,$car1,$car1
    155 	st	$car1,[$tp]
    156 	srlx	$car1,32,$car1
    157 
    158 	add	$tmp0,$car0,$car0
    159 	and	$car0,$mask,$acc0
    160 	add	$tmp1,$car1,$car1
    161 	srlx	$car0,32,$car0
    162 	add	$acc0,$car1,$car1
    163 	st	$car1,[$tp+4]
    164 	srlx	$car1,32,$car1
    165 
    166 	add	$car0,$car1,$car1
    167 	st	$car1,[$tp+8]
    168 	srlx	$car1,32,$car2
    169 
    171 	mov	4,$i			! i++
    172 	ld	[$bp+4],$mul0		! bp[1]
    173 .Louter:
    174 	add	%sp,$bias+$frame,$tp
    175 	ld	[$ap],$car0		! ap[0]
    176 	ld	[$ap+4],$apj		! ap[1]
    177 	ld	[$np],$car1		! np[0]
    178 	ld	[$np+4],$npj		! np[1]
    179 	ld	[$tp],$tmp1		! tp[0]
    180 	ld	[$tp+4],$tpj		! tp[1]
    181 	mov	12,$j
    182 
    183 	mulx	$car0,$mul0,$car0
    184 	mulx	$apj,$mul0,$tmp0	!prologue!
    185 	add	$tmp1,$car0,$car0
    186 	ld	[$ap+8],$apj		!prologue!
    187 	and	$car0,$mask,$acc0
    188 
    189 	mulx	$n0,$acc0,$mul1
    190 	and	$mul1,$mask,$mul1
    191 
    192 	mulx	$car1,$mul1,$car1
    193 	mulx	$npj,$mul1,$acc1	!prologue!
    194 	srlx	$car0,32,$car0
    195 	add	$acc0,$car1,$car1
    196 	ld	[$np+8],$npj		!prologue!
    197 	srlx	$car1,32,$car1
    198 	mov	$tmp0,$acc0		!prologue!
    199 
    200 .Linner:
    201 	mulx	$apj,$mul0,$tmp0
    202 	mulx	$npj,$mul1,$tmp1
    203 	add	$tpj,$car0,$car0
    204 	ld	[$ap+$j],$apj		! ap[j]
    205 	add	$acc0,$car0,$car0
    206 	add	$acc1,$car1,$car1
    207 	ld	[$np+$j],$npj		! np[j]
    208 	and	$car0,$mask,$acc0
    209 	ld	[$tp+8],$tpj		! tp[j]
    210 	srlx	$car0,32,$car0
    211 	add	$acc0,$car1,$car1
    212 	add	$j,4,$j			! j++
    213 	mov	$tmp0,$acc0
    214 	st	$car1,[$tp]		! tp[j-1]
    215 	srlx	$car1,32,$car1
    216 	mov	$tmp1,$acc1
    217 	cmp	$j,$num
    218 	bl	%icc,.Linner
    219 	add	$tp,4,$tp		! tp++
    220 !.Linner
    221 
    222 	mulx	$apj,$mul0,$tmp0	!epilogue!
    223 	mulx	$npj,$mul1,$tmp1
    224 	add	$tpj,$car0,$car0
    225 	add	$acc0,$car0,$car0
    226 	ld	[$tp+8],$tpj		! tp[j]
    227 	and	$car0,$mask,$acc0
    228 	add	$acc1,$car1,$car1
    229 	srlx	$car0,32,$car0
    230 	add	$acc0,$car1,$car1
    231 	st	$car1,[$tp]		! tp[j-1]
    232 	srlx	$car1,32,$car1
    233 
    234 	add	$tpj,$car0,$car0
    235 	add	$tmp0,$car0,$car0
    236 	and	$car0,$mask,$acc0
    237 	add	$tmp1,$car1,$car1
    238 	add	$acc0,$car1,$car1
    239 	st	$car1,[$tp+4]		! tp[j-1]
    240 	srlx	$car0,32,$car0
    241 	add	$i,4,$i			! i++
    242 	srlx	$car1,32,$car1
    243 
    244 	add	$car0,$car1,$car1
    245 	cmp	$i,$num
    246 	add	$car2,$car1,$car1
    247 	st	$car1,[$tp+8]
    248 
    249 	srlx	$car1,32,$car2
    250 	bl,a	%icc,.Louter
    251 	ld	[$bp+$i],$mul0		! bp[i]
    252 !.Louter
    253 
    254 	add	$tp,12,$tp
    255 
    257 .Ltail:
    258 	add	$np,$num,$np
    259 	add	$rp,$num,$rp
    260 	mov	$tp,$ap
    261 	sub	%g0,$num,%o7		! k=-num
    262 	ba	.Lsub
    263 	subcc	%g0,%g0,%g0		! clear %icc.c
    264 .align	16
    265 .Lsub:
    266 	ld	[$tp+%o7],%o0
    267 	ld	[$np+%o7],%o1
    268 	subccc	%o0,%o1,%o1		! tp[j]-np[j]
    269 	add	$rp,%o7,$i
    270 	add	%o7,4,%o7
    271 	brnz	%o7,.Lsub
    272 	st	%o1,[$i]
    273 	subc	$car2,0,$car2		! handle upmost overflow bit
    274 	and	$tp,$car2,$ap
    275 	andn	$rp,$car2,$np
    276 	or	$ap,$np,$ap
    277 	sub	%g0,$num,%o7
    278 
    279 .Lcopy:
    280 	ld	[$ap+%o7],%o0		! copy or in-place refresh
    281 	st	%g0,[$tp+%o7]		! zap tp
    282 	st	%o0,[$rp+%o7]
    283 	add	%o7,4,%o7
    284 	brnz	%o7,.Lcopy
    285 	nop
    286 	mov	1,%i0
    287 	ret
    288 	restore
    289 ___
    290 
    292 ########
    293 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
    294 ######## code without following dedicated squaring procedure.
    295 ########
        #
        # Generated code, part 2: the squaring path, reached from the ap == bp
        # test in part 1 (so $mul0, $apj, $car1, $npj and $j are already
        # loaded when .Lbn_sqr_mont is entered). The accumulated ap[j]*ap[i]
        # terms are doubled word by word (the add acc,acc,acc / or $sbit
        # sequences), with $sbit holding the bit shifted out of one word and
        # into the next. $bp is not read on this path, so its register is
        # recycled as $sbit. .Lsqr_1st and .Lsqr_2nd handle the first two
        # passes, .Lsqr_outer the middle ones and .Lsqr_last the final one;
        # the path then branches to .Ltail in part 1 for the closing
        # subtraction and copy.
    296 $sbit="%i2";		# re-use $bp!
    297 
    298 $code.=<<___;
    299 .align	32
    300 .Lbn_sqr_mont:
    301 	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
    302 	mulx	$apj,$mul0,$tmp0		!prologue!
    303 	and	$car0,$mask,$acc0
    304 	add	%sp,$bias+$frame,$tp
    305 	ld	[$ap+8],$apj			!prologue!
    306 
    307 	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
    308 	srlx	$car0,32,$car0
    309 	and	$mul1,$mask,$mul1
    310 
    311 	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
    312 	mulx	$npj,$mul1,$acc1		!prologue!
    313 	and	$car0,1,$sbit
    314 	ld	[$np+8],$npj			!prologue!
    315 	srlx	$car0,1,$car0
    316 	add	$acc0,$car1,$car1
    317 	srlx	$car1,32,$car1
    318 	mov	$tmp0,$acc0			!prologue!
    319 
    320 .Lsqr_1st:
    321 	mulx	$apj,$mul0,$tmp0
    322 	mulx	$npj,$mul1,$tmp1
    323 	add	$acc0,$car0,$car0		! ap[j]*a0+c0
    324 	add	$acc1,$car1,$car1
    325 	ld	[$ap+$j],$apj			! ap[j]
    326 	and	$car0,$mask,$acc0
    327 	ld	[$np+$j],$npj			! np[j]
    328 	srlx	$car0,32,$car0
    329 	add	$acc0,$acc0,$acc0
    330 	or	$sbit,$acc0,$acc0
    331 	mov	$tmp1,$acc1
    332 	srlx	$acc0,32,$sbit
    333 	add	$j,4,$j				! j++
    334 	and	$acc0,$mask,$acc0
    335 	cmp	$j,$num
    336 	add	$acc0,$car1,$car1
    337 	st	$car1,[$tp]
    338 	mov	$tmp0,$acc0
    339 	srlx	$car1,32,$car1
    340 	bl	%icc,.Lsqr_1st
    341 	add	$tp,4,$tp			! tp++
    342 !.Lsqr_1st
    343 
    344 	mulx	$apj,$mul0,$tmp0		! epilogue
    345 	mulx	$npj,$mul1,$tmp1
    346 	add	$acc0,$car0,$car0		! ap[j]*a0+c0
    347 	add	$acc1,$car1,$car1
    348 	and	$car0,$mask,$acc0
    349 	srlx	$car0,32,$car0
    350 	add	$acc0,$acc0,$acc0
    351 	or	$sbit,$acc0,$acc0
    352 	srlx	$acc0,32,$sbit
    353 	and	$acc0,$mask,$acc0
    354 	add	$acc0,$car1,$car1
    355 	st	$car1,[$tp]
    356 	srlx	$car1,32,$car1
    357 
    358 	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
    359 	add	$tmp1,$car1,$car1
    360 	and	$car0,$mask,$acc0
    361 	srlx	$car0,32,$car0
    362 	add	$acc0,$acc0,$acc0
    363 	or	$sbit,$acc0,$acc0
    364 	srlx	$acc0,32,$sbit
    365 	and	$acc0,$mask,$acc0
    366 	add	$acc0,$car1,$car1
    367 	st	$car1,[$tp+4]
    368 	srlx	$car1,32,$car1
    369 
    370 	add	$car0,$car0,$car0
    371 	or	$sbit,$car0,$car0
    372 	add	$car0,$car1,$car1
    373 	st	$car1,[$tp+8]
    374 	srlx	$car1,32,$car2
    375 
    377 	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
    378 	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
    379 	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
    380 	ld	[$ap+4],$mul0			! ap[1]
    381 	ld	[$ap+8],$apj			! ap[2]
    382 	ld	[$np],$car1			! np[0]
    383 	ld	[$np+4],$npj			! np[1]
    384 	mulx	$n0,$tmp0,$mul1
    385 
    386 	mulx	$mul0,$mul0,$car0
    387 	and	$mul1,$mask,$mul1
    388 
    389 	mulx	$car1,$mul1,$car1
    390 	mulx	$npj,$mul1,$acc1
    391 	add	$tmp0,$car1,$car1
    392 	and	$car0,$mask,$acc0
    393 	ld	[$np+8],$npj			! np[2]
    394 	srlx	$car1,32,$car1
    395 	add	$tmp1,$car1,$car1
    396 	srlx	$car0,32,$car0
    397 	add	$acc0,$car1,$car1
    398 	and	$car0,1,$sbit
    399 	add	$acc1,$car1,$car1
    400 	srlx	$car0,1,$car0
    401 	mov	12,$j
    402 	st	$car1,[%sp+$bias+$frame]	! tp[0]=
    403 	srlx	$car1,32,$car1
    404 	add	%sp,$bias+$frame+4,$tp
    405 
    406 .Lsqr_2nd:
    407 	mulx	$apj,$mul0,$acc0
    408 	mulx	$npj,$mul1,$acc1
    409 	add	$acc0,$car0,$car0
    410 	add	$tpj,$car1,$car1
    411 	ld	[$ap+$j],$apj			! ap[j]
    412 	and	$car0,$mask,$acc0
    413 	ld	[$np+$j],$npj			! np[j]
    414 	srlx	$car0,32,$car0
    415 	add	$acc1,$car1,$car1
    416 	ld	[$tp+8],$tpj			! tp[j]
    417 	add	$acc0,$acc0,$acc0
    418 	add	$j,4,$j				! j++
    419 	or	$sbit,$acc0,$acc0
    420 	srlx	$acc0,32,$sbit
    421 	and	$acc0,$mask,$acc0
    422 	cmp	$j,$num
    423 	add	$acc0,$car1,$car1
    424 	st	$car1,[$tp]			! tp[j-1]
    425 	srlx	$car1,32,$car1
    426 	bl	%icc,.Lsqr_2nd
    427 	add	$tp,4,$tp			! tp++
    428 !.Lsqr_2nd
    429 
    430 	mulx	$apj,$mul0,$acc0
    431 	mulx	$npj,$mul1,$acc1
    432 	add	$acc0,$car0,$car0
    433 	add	$tpj,$car1,$car1
    434 	and	$car0,$mask,$acc0
    435 	srlx	$car0,32,$car0
    436 	add	$acc1,$car1,$car1
    437 	add	$acc0,$acc0,$acc0
    438 	or	$sbit,$acc0,$acc0
    439 	srlx	$acc0,32,$sbit
    440 	and	$acc0,$mask,$acc0
    441 	add	$acc0,$car1,$car1
    442 	st	$car1,[$tp]			! tp[j-1]
    443 	srlx	$car1,32,$car1
    444 
    445 	add	$car0,$car0,$car0
    446 	or	$sbit,$car0,$car0
    447 	add	$car0,$car1,$car1
    448 	add	$car2,$car1,$car1
    449 	st	$car1,[$tp+4]
    450 	srlx	$car1,32,$car2
    451 
    453 	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
    454 	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
    455 	ld	[$ap+8],$mul0			! ap[2]
    456 	ld	[$np],$car1			! np[0]
    457 	ld	[$np+4],$npj			! np[1]
    458 	mulx	$n0,$tmp1,$mul1
    459 	and	$mul1,$mask,$mul1
    460 	mov	8,$i
    461 
    462 	mulx	$mul0,$mul0,$car0
    463 	mulx	$car1,$mul1,$car1
    464 	and	$car0,$mask,$acc0
    465 	add	$tmp1,$car1,$car1
    466 	srlx	$car0,32,$car0
    467 	add	%sp,$bias+$frame,$tp
    468 	srlx	$car1,32,$car1
    469 	and	$car0,1,$sbit
    470 	srlx	$car0,1,$car0
    471 	mov	4,$j
    472 
    473 .Lsqr_outer:
    474 .Lsqr_inner1:
    475 	mulx	$npj,$mul1,$acc1
    476 	add	$tpj,$car1,$car1
    477 	add	$j,4,$j
    478 	ld	[$tp+8],$tpj
    479 	cmp	$j,$i
    480 	add	$acc1,$car1,$car1
    481 	ld	[$np+$j],$npj
    482 	st	$car1,[$tp]
    483 	srlx	$car1,32,$car1
    484 	bl	%icc,.Lsqr_inner1
    485 	add	$tp,4,$tp
    486 !.Lsqr_inner1
    487 
    488 	add	$j,4,$j
    489 	ld	[$ap+$j],$apj			! ap[j]
    490 	mulx	$npj,$mul1,$acc1
    491 	add	$tpj,$car1,$car1
    492 	ld	[$np+$j],$npj			! np[j]
    493 	add	$acc0,$car1,$car1
    494 	ld	[$tp+8],$tpj			! tp[j]
    495 	add	$acc1,$car1,$car1
    496 	st	$car1,[$tp]
    497 	srlx	$car1,32,$car1
    498 
    499 	add	$j,4,$j
    500 	cmp	$j,$num
    501 	be,pn	%icc,.Lsqr_no_inner2
    502 	add	$tp,4,$tp
    503 
    504 .Lsqr_inner2:
    505 	mulx	$apj,$mul0,$acc0
    506 	mulx	$npj,$mul1,$acc1
    507 	add	$tpj,$car1,$car1
    508 	add	$acc0,$car0,$car0
    509 	ld	[$ap+$j],$apj			! ap[j]
    510 	and	$car0,$mask,$acc0
    511 	ld	[$np+$j],$npj			! np[j]
    512 	srlx	$car0,32,$car0
    513 	add	$acc0,$acc0,$acc0
    514 	ld	[$tp+8],$tpj			! tp[j]
    515 	or	$sbit,$acc0,$acc0
    516 	add	$j,4,$j				! j++
    517 	srlx	$acc0,32,$sbit
    518 	and	$acc0,$mask,$acc0
    519 	cmp	$j,$num
    520 	add	$acc0,$car1,$car1
    521 	add	$acc1,$car1,$car1
    522 	st	$car1,[$tp]			! tp[j-1]
    523 	srlx	$car1,32,$car1
    524 	bl	%icc,.Lsqr_inner2
    525 	add	$tp,4,$tp			! tp++
    526 
    527 .Lsqr_no_inner2:
    528 	mulx	$apj,$mul0,$acc0
    529 	mulx	$npj,$mul1,$acc1
    530 	add	$tpj,$car1,$car1
    531 	add	$acc0,$car0,$car0
    532 	and	$car0,$mask,$acc0
    533 	srlx	$car0,32,$car0
    534 	add	$acc0,$acc0,$acc0
    535 	or	$sbit,$acc0,$acc0
    536 	srlx	$acc0,32,$sbit
    537 	and	$acc0,$mask,$acc0
    538 	add	$acc0,$car1,$car1
    539 	add	$acc1,$car1,$car1
    540 	st	$car1,[$tp]			! tp[j-1]
    541 	srlx	$car1,32,$car1
    542 
    543 	add	$car0,$car0,$car0
    544 	or	$sbit,$car0,$car0
    545 	add	$car0,$car1,$car1
    546 	add	$car2,$car1,$car1
    547 	st	$car1,[$tp+4]
    548 	srlx	$car1,32,$car2
    549 
    551 	add	$i,4,$i				! i++
    552 	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
    553 	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
    554 	ld	[$ap+$i],$mul0			! ap[j]
    555 	ld	[$np],$car1			! np[0]
    556 	ld	[$np+4],$npj			! np[1]
    557 	mulx	$n0,$tmp1,$mul1
    558 	and	$mul1,$mask,$mul1
    559 	add	$i,4,$tmp0
    560 
    561 	mulx	$mul0,$mul0,$car0
    562 	mulx	$car1,$mul1,$car1
    563 	and	$car0,$mask,$acc0
    564 	add	$tmp1,$car1,$car1
    565 	srlx	$car0,32,$car0
    566 	add	%sp,$bias+$frame,$tp
    567 	srlx	$car1,32,$car1
    568 	and	$car0,1,$sbit
    569 	srlx	$car0,1,$car0
    570 
    571 	cmp	$tmp0,$num			! i<num-1
    572 	bl	%icc,.Lsqr_outer
    573 	mov	4,$j
    574 
    576 .Lsqr_last:
    577 	mulx	$npj,$mul1,$acc1
    578 	add	$tpj,$car1,$car1
    579 	add	$j,4,$j
    580 	ld	[$tp+8],$tpj
    581 	cmp	$j,$i
    582 	add	$acc1,$car1,$car1
    583 	ld	[$np+$j],$npj
    584 	st	$car1,[$tp]
    585 	srlx	$car1,32,$car1
    586 	bl	%icc,.Lsqr_last
    587 	add	$tp,4,$tp
    588 !.Lsqr_last
    589 
    590 	mulx	$npj,$mul1,$acc1
    591 	add	$tpj,$car1,$car1
    592 	add	$acc0,$car1,$car1
    593 	add	$acc1,$car1,$car1
    594 	st	$car1,[$tp]
    595 	srlx	$car1,32,$car1
    596 
    597 	add	$car0,$car0,$car0		! recover $car0
    598 	or	$sbit,$car0,$car0
    599 	add	$car0,$car1,$car1
    600 	add	$car2,$car1,$car1
    601 	st	$car1,[$tp+4]
    602 	srlx	$car1,32,$car2
    603 
    604 	ba	.Ltail
    605 	add	$tp,8,$tp
    606 .type	$fname,#function
    607 .size	$fname,(.-$fname)
    608 .asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
    609 .align	32
    610 ___
    611 $code =~ s/\`([^\`]*)\`/eval($1)/gem;	# replace each backquoted span in the generated text with the result of evaluating it as Perl
    612 print $code;
    613 close STDOUT or die "error closing STDOUT: $!";	# a failed flush (full disk, closed pipe) must not yield a silently truncated .s file
    614