Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # January 2010
     11 #
     12 # "Teaser" Montgomery multiplication module for IA-64. There are
     13 # several possibilities for improvement:
     14 #
     15 # - modulo-scheduling outer loop would eliminate quite a number of
     16 #   stalls after ldf8, xma and getf.sig outside inner loop and
     17 #   improve shorter key performance;
     18 # - shorter vector support [with input vectors being fetched only
     19 #   once] should be added;
     20 # - 2x unroll with help of n0[1] would make the code scalable on
     21 #   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
     22 #   acute interest, because upcoming Tukwila's individual cores are
     23 #   reportedly based on Itanium 2 design;
     24 # - dedicated squaring procedure(?);
     25 #
     26 # January 2010
     27 #
     28 # Shorter vector support is implemented by zero-padding ap and np
     29 # vectors up to 8 elements, or 512 bits. This means that 256-bit
     30 # inputs will be processed only 2 times faster than 512-bit inputs,
     31 # not 4 [as one would expect, because algorithm complexity is n^2].
     32 # The reason for padding is that inputs shorter than 512 bits won't
     33 # be processed faster anyway, because minimal critical path of the
     34 # core loop happens to match 512-bit timing. Either way, it resulted
     35 # in >100% improvement of 512-bit RSA sign benchmark and 50% - of
     36 # 1024-bit one [in comparison to original version of *this* module].
     37 #
     38 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
     39 # this module is:
     40 #                   sign    verify    sign/s verify/s
     41 # rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
     42 # rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
     43 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
     44 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
     45 # dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
     46 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
     47 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
     48 #
     49 # ... and *without* (but still with ia64.S):
     50 #
     51 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
     52 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
     53 # rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
     54 # rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
     55 # dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
     56 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
     57 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
     58 #
     59 # As it can be seen, RSA sign performance improves by 130-30%,
     60 # hereafter less for longer keys, while verify - by 74-13%.
     61 # DSA performance improves by 115-30%.
     62 
     63 if ($^O eq "hpux") {
        # \$ADDP is the mnemonic the assembly below uses to turn incoming
        # pointer arguments into addresses.  On HP-UX it is "addp4" unless a
        # 64-bit build flag (+DD64 or -mlp64) is present on the command line;
        # presumably because HP-UX otherwise uses 32-bit pointers -- confirm.
        # The pattern is an alternation: the previous bracketed form was a
        # character class matching any one of "+D|-mlp" followed by "64".
     64     $ADDP="addp4";
     65     for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
     66 } else { $ADDP="add"; }
     67 
     68 $code=<<___;
     69 .explicit
     70 .text
     71 
     72 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
     73 //		    const BN_ULONG *bp,const BN_ULONG *np,
     74 //		    const BN_ULONG *n0p,int num);			
        //
        // Dispatcher. Only the low 32 bits of r37 (the sixth argument, num)
        // are inspected; no operand memory is touched here.
        //   num <  2      : return with ret0=0
        //   2 <= num <= 8 : branch to bn_mul_mont_8
        //   num >  8      : branch to bn_mul_mont_general
        // Both targets set ret0=1 ("handled") before returning, so ret0=0
        // presumably tells the caller to fall back to another path.
     75 .align	64
     76 .global	bn_mul_mont#
     77 .proc	bn_mul_mont#
     78 bn_mul_mont:
     79 	.prologue
     80 	.body
     81 { .mmi;	cmp4.le		p6,p7=2,r37;;	// p6=(2<=num), p7=!p6
     82 (p6)	cmp4.lt.unc	p8,p9=8,r37	// if p6: p8=(8<num), p9=!p8; else both cleared
     83 	mov		ret0=r0		};;
     84 { .bbb;
     85 (p9)	br.cond.dptk.many	bn_mul_mont_8
     86 (p8)	br.cond.dpnt.many	bn_mul_mont_general
     87 (p7)	br.ret.spnt.many	b0	};;
     88 .endp	bn_mul_mont#
     89 
     91 prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
        // prevfs/prevpr/prevlc/prevsp receive the caller's ar.pfs, pr, ar.lc
        // and sp in the prologues below and are used to restore them on exit.
     92 
        // Working pointers into rp, ap, bp and np.
     93 rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
     94 tptr=r16;	// &tp[0]
     95 tp_1=r17;	// &tp[-1]
        // num: word count (decremented once per bp word); len: num*8 bytes,
        // used to rewind pointers; lc: value loaded into ar.lc.
     96 num=r18;	len=r19;	lc=r20;
     97 topbit=r21;	// carry bit from tmp[num]
     98 
        // Floating-point registers: n0 is loaded from *n0p, m0 is the
        // per-pass multiplier (low product of the first word and n0),
        // bi is the current word of bp.
     99 n0=f6;
    100 m0=f7;
    101 bi=f8;
    102 
    103 .align	64
        // bn_mul_mont_general: branched to from bn_mul_mont when num > 8.
        // The prologue saves ar.pfs, ar.lc, sp and pr into prevfs, prevlc,
        // prevsp and prevpr; the epilogue restores ar.lc, sp and pr.
    104 .local	bn_mul_mont_general#
    105 .proc	bn_mul_mont_general#
    106 bn_mul_mont_general:
    107 	.prologue
    108 { .mmi;	.save	ar.pfs,prevfs
    109 	alloc	prevfs=ar.pfs,6,2,0,8
    110 	$ADDP	aptr=0,in1
    111 	.save	ar.lc,prevlc
    112 	mov	prevlc=ar.lc		}
    113 { .mmi;	.vframe	prevsp
    114 	mov	prevsp=sp
    115 	$ADDP	bptr=0,in2
    116 	.save	pr,prevpr
    117 	mov	prevpr=pr		};;
    118 
    119 	.body
    120 	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
    121 	.rotr		a[3],n[3],t[2]
    122 
        // Preload bp[0], ap[0..3], n0 and np[0..1] ahead of the first loop.
        // r30 is a second ap pointer offset by 8, so the two pointers fetch
        // alternate words with a stride of 16 bytes.
    123 { .mmi;	ldf8		bi=[bptr],8		// (*bp++)
    124 	ldf8		alo[4]=[aptr],16	// ap[0]
    125 	$ADDP		r30=8,in1	};;
    126 { .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
    127 	ldf8		alo[2]=[aptr],16	// ap[2]
    128 	$ADDP		in4=0,in4	};;
    129 { .mmi;	ldf8		alo[1]=[r30]		// ap[3]
    130 	ldf8		n0=[in4]		// n0
    131 	$ADDP		rptr=0,in0		}
    132 { .mmi;	$ADDP		nptr=0,in3
    133 	mov		r31=16
    134 	zxt4		num=in5		};;	// zero-extend the 32-bit num
    135 { .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
    136 	shladd		len=num,3,r0		// len = num*8
    137 	shladd		r31=num,3,r31	};;	// r31 = num*8+16
    138 { .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
    139 	add		lc=-5,num		// loop count register value
    140 	sub		r31=sp,r31	};;
        // sp is lowered by num*8+16 bytes and rounded down to a multiple of
        // 16; the temporary vector is addressed as tp_1=sp+8 and tptr=sp+16.
    141 { .mfb;	and		sp=-16,r31		// alloca
    142 	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
    143 	nop.b		0		}
    144 { .mfb;	nop.m		0
    145 	xmpy.lu		alo[4]=alo[4],bi
    146 	brp.loop.imp	.L1st_ctop,.L1st_cend-16
    147 					};;
    148 { .mfi;	nop.m		0
    149 	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
    150 	add		tp_1=8,sp	}
    151 { .mfi;	nop.m		0
    152 	xma.lu		alo[3]=alo[3],bi,ahi[2]
    153 	mov		pr.rot=0x20001f<<16
    154 			// ------^----- (p40) at first (p23)
    155 			// ----------^^ p[16:20]=1
    156 					};;
    157 { .mfi;	nop.m		0
    158 	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
    159 	mov		ar.lc=lc	}
    160 { .mfi;	nop.m		0
    161 	fcvt.fxu.s1	nhi[1]=f0		// start the np*m0 carry chain from f0
    162 	mov		ar.ec=8		};;
    163 
    164 .align	32
        // First pass (bi holds bp[0]; nothing is read from tp here).
        // br.ctop loop over rotating registers: the predicate on each
        // instruction selects the stage it belongs to.
        //   alo/ahi : low/high halves of ap[j]*bi + previous ahi
        //   nlo/nhi : low/high halves of np[j]*m0 + previous nhi
        //   a[],n[] : integer copies of alo/nlo obtained with getf.sig
        // n[2]+a[2] is stored through tp_1; p40/p42 (declared mutex) select
        // the add without/with carry-in and the compares write the next
        // carry into p41/p39.
    165 .L1st_ctop:
    166 .pred.rel	"mutex",p40,p42
    167 { .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
    168 	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
    169 	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
    170 { .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
    171 	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
    172 	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
    173 { .mfi;	(p21)	getf.sig	a[0]=alo[5]
    174 	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
    175 	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
    176 { .mfi;	(p23)	st8		[tp_1]=n[2],8
    177 	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
    178 	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
    179 { .mmb;	(p21)	getf.sig	n[0]=nlo[3]
    180 	(p16)	nop.m		0
    181 	br.ctop.sptk	.L1st_ctop			};;
    182 .L1st_cend:
    183 
    184 { .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
        // Tail of the first pass: add the two topmost high words with the
        // pending carry, record the carry-out in topbit, store the last
        // word, rewind aptr/nptr by len bytes and reset tptr/tp_1.
    185 	getf.sig	n[0]=nhi[4]
    186 	add		num=-1,num	};;	// num--
    187 { .mmi;	.pred.rel	"mutex",p40,p42
    188 (p40)	add		n[0]=n[0],a[0]
    189 (p42)	add		n[0]=n[0],a[0],1
    190 	sub		aptr=aptr,len	};;	// rewind
    191 { .mmi;	.pred.rel	"mutex",p40,p42
    192 (p40)	cmp.ltu		p41,p39=n[0],a[0]
    193 (p42)	cmp.leu		p41,p39=n[0],a[0]
    194 	sub		nptr=nptr,len	};;
    195 { .mmi;	.pred.rel	"mutex",p39,p41
    196 (p39)	add		topbit=r0,r0		// no carry-out: topbit=0
    197 (p41)	add		topbit=r0,r0,1		// carry-out:    topbit=1
    198 	nop.i		0		}	
    199 { .mmi;	st8		[tp_1]=n[0]
    200 	add		tptr=16,sp
    201 	add		tp_1=8,sp	};;
    202 
    204 .Louter:
        // Start of one pass per remaining bp word: load the next bi, tp[0],
        // ap[0..3] and np[0..1], compute m0 for this pass and re-arm the
        // rotating predicates, ar.lc and ar.ec for .Linner_ctop.
        // r30 and r31 are secondary pointers at aptr+8 and nptr+8.
    205 { .mmi;	ldf8		bi=[bptr],8		// (*bp++)
    206 	ldf8		ahi[3]=[tptr]		// tp[0]
    207 	add		r30=8,aptr	};;
    208 { .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
    209 	ldf8		alo[3]=[r30],16		// ap[1]
    210 	add		r31=8,nptr	};;
    211 { .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
    212 	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
    213 	brp.loop.imp	.Linner_ctop,.Linner_cend-16
    214 					}
    215 { .mfb;	ldf8		alo[1]=[r30]		// ap[3]
    216 	xma.lu		alo[4]=alo[4],bi,ahi[3]
    217 	clrrrb.pr			};;	// reset predicate rotation base
    218 { .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
    219 	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
    220 	nop.i		0		}
    221 { .mfi;	ldf8		nlo[1]=[r31]		// np[1]
    222 	xma.lu		alo[3]=alo[3],bi,ahi[2]
    223 	mov		pr.rot=0x20101f<<16
    224 			// ------^----- (p40) at first (p23)
    225 			// --------^--- (p30) at first (p22)
    226 			// ----------^^ p[16:20]=1
    227 					};;
    228 { .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
    229 	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
    230 	mov		ar.lc=lc	}
    231 { .mfi;
    232 	fcvt.fxu.s1	nhi[1]=f0
    233 	mov		ar.ec=8		};;
    234 
    235 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
    236 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
    237 // in latter case accounts for two-tick pipeline stall, which means
    238 // that its performance would be ~20% lower than optimal one. No
    239 // attempt was made to address this, because original Itanium is
    240 // hardly represented out in the wild...
        //
        // Same structure as .L1st_ctop with one extra accumulation: each
        // word t[] loaded through tptr is added to a[] (carry kept in the
        // mutex pair p30/p32, next carry written to p31/p29) before a[] is
        // added to n[] (carry pair p40/p42) and stored through tp_1.
    241 .align	32
    242 .Linner_ctop:
    243 .pred.rel	"mutex",p40,p42
    244 .pred.rel	"mutex",p30,p32
    245 { .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
    246 	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
    247 	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
    248 { .mfi;	(p16)	nop.m		0
    249 	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
    250 	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
    251 { .mfi;	(p21)	getf.sig	a[0]=alo[5]
    252 	(p16)	nop.f		0
    253 	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
    254 { .mfi;	(p21)	ld8		t[0]=[tptr],8
    255 	(p16)	nop.f		0
    256 	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
    257 { .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
    258 	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
    259 	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
    260 { .mfi;	(p16)	nop.m		0
    261 	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
    262 	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
    263 { .mmi;	(p21)	getf.sig	n[0]=nlo[3]
    264 	(p16)	nop.m		0
    265 	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
    266 { .mmb;	(p23)	st8		[tp_1]=n[2],8
    267 	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
    268 	br.ctop.sptk	.Linner_ctop			};;
    269 .Linner_cend:
    270 
    271 { .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
        // Tail of one pass: fold the previous topbit and the pending carries
        // into the topmost word, store it, derive the new topbit, rewind
        // aptr/nptr by len bytes, reset tptr/tp_1 and loop to .Louter while
        // num != 1 (num is decremented on the way).
    272 	getf.sig	n[0]=nhi[4]
    273 	nop.i		0		};;
    274 
    275 { .mmi;	.pred.rel	"mutex",p31,p33
    276 (p31)	add		a[0]=a[0],topbit
    277 (p33)	add		a[0]=a[0],topbit,1
    278 	mov		topbit=r0	};;
        // NOTE(review): topbit was cleared in the instruction group above,
        // so the two compares below read topbit as zero rather than its
        // previous value -- confirm this is the intended carry test.
    279 { .mfi; .pred.rel	"mutex",p31,p33
    280 (p31)	cmp.ltu		p32,p30=a[0],topbit
    281 (p33)	cmp.leu		p32,p30=a[0],topbit
    282 					}
    283 { .mfi;	.pred.rel	"mutex",p40,p42
    284 (p40)	add		n[0]=n[0],a[0]
    285 (p42)	add		n[0]=n[0],a[0],1
    286 					};;
        // The annotation names the predicates that actually guard the two
        // compares in this bundle (it previously named p44,p46, which no
        // instruction here uses).
    287 { .mmi;	.pred.rel	"mutex",p40,p42
    288 (p40)	cmp.ltu		p41,p39=n[0],a[0]
    289 (p42)	cmp.leu		p41,p39=n[0],a[0]
    290 (p32)	add		topbit=r0,r0,1	}	// carry from a[0]+topbit
    291 
    292 { .mmi;	st8		[tp_1]=n[0],8
    293 	cmp4.ne		p6,p0=1,num		// p6 = more bp words remain
    294 	sub		aptr=aptr,len	};;	// rewind
    295 { .mmi;	sub		nptr=nptr,len
    296 (p41)	add		topbit=r0,r0,1		// carry from n[0]+a[0]
    297 	add		tptr=16,sp	}
    298 { .mmb;	add		tp_1=8,sp
    299 	add		num=-1,num		// num--
    300 (p6)	br.cond.sptk.many	.Louter	};;
    301 
    303 { .mbb;	add		lc=4,lc
        // lc was num-5, so ar.lc becomes num-1 for the two loops that follow.
        // .Lsub_ctop computes n = t - n word by word (t read through tptr,
        // n through nptr) and stores the difference through rptr. The borrow
        // travels in the mutex pair p33/p35; each compare writes the next
        // borrow into p34/p32.
    304 	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
    305 	clrrrb.pr			};;
    306 { .mii;	nop.m		0
    307 	mov		pr.rot=0x10001<<16
    308 			// ------^---- (p33) at first (p17)
    309 	mov		ar.lc=lc	}
    310 { .mii;	nop.m		0
    311 	mov		ar.ec=3
    312 	nop.i		0		};;
    313 
    314 .Lsub_ctop:
    315 .pred.rel	"mutex",p33,p35
    316 { .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
    317 	(p16)	nop.f		0
    318 	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
    319 { .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
    320 	(p16)	nop.f		0
    321 	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
    322 { .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
    323 	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
    324 	(p18)	nop.b		0			}
    325 { .mib;	(p18)	nop.m		0
    326 	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
    327 	br.ctop.sptk	.Lsub_ctop			};;
    328 .Lsub_cend:
    329 
    330 { .mmb;	.pred.rel	"mutex",p34,p36
        // topbit is reduced by the final borrow and then used as an AND mask
        // to pick the copy source without branching:
        //   nptr = (tptr & topbit) | (rptr & ~topbit)
        // (presumably topbit is 0 or all-ones at this point -- confirm).
        // .Lcopy_ctop then copies num words from nptr to rptr and overwrites
        // the temporary vector with zeros through tptr.
    331 (p34)	sub	topbit=topbit,r0	// (p19)
    332 (p36)	sub	topbit=topbit,r0,1
    333 	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
    334 					}
    335 { .mmb;	sub	rptr=rptr,len		// rewind
    336 	sub	tptr=tptr,len
    337 	clrrrb.pr			};;
    338 { .mmi;	and	aptr=tptr,topbit
    339 	andcm	bptr=rptr,topbit
    340 	mov	pr.rot=1<<16		};;
    341 { .mii;	or	nptr=aptr,bptr
    342 	mov	ar.lc=lc
    343 	mov	ar.ec=3			};;
    344 
    345 .Lcopy_ctop:
    346 { .mmb;	(p16)	ld8	n[0]=[nptr],8
    347 	(p18)	st8	[tptr]=r0,8
    348 	(p16)	nop.b	0		}
    349 { .mmb;	(p16)	nop.m	0
    350 	(p18)	st8	[rptr]=n[2],8
    351 	br.ctop.sptk	.Lcopy_ctop	};;
    352 .Lcopy_cend:
    353 
    354 { .mmi;	mov		ret0=1			// signal "handled"
        // Epilogue: restore ar.lc, sp (which releases the alloca area) and
        // the predicate registers saved in the prologue, then return.
    355 	rum		1<<5			// clear um.mfh
    356 	mov		ar.lc=prevlc	}
    357 { .mib;	.restore	sp
    358 	mov		sp=prevsp
    359 	mov		pr=prevpr,0x1ffff
    360 	br.ret.sptk.many	b0	};;
    361 .endp	bn_mul_mont_general#
    362 
    364 a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
        // Aliases used by bn_mul_mont_8. a1..a8 and n1..n8 are integer
        // registers receiving words of the ap*b[i] and np*m0 products, t0 is
        // the least significant word of the running sum. The register
        // numbers overlap those named for bn_mul_mont_general (r16 is tptr
        // there, r15 is nptr).
    365 n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
    366 t0=r15;
    367 
        // ai0..ai7 / ni0..ni7 hold ap[0..7] / np[0..7] as floating-point
        // significands; f16-f23 are spilled in the prologue of bn_mul_mont_8
        // and refilled at .Ldone.
    368 ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
    369 ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
    370 
    371 .align	64
        // bn_mul_mont_8: branched to from bn_mul_mont when 2 <= num <= 8.
        // The prologue saves ar.pfs, sp, ar.lc and pr, then spills f16..f23
        // into eight consecutive 16-byte slots at prevsp-7*16 .. prevsp+0.
        // The first spill post-decrements sp, leaving sp = prevsp-8*16.
    372 .skip	48		// aligns loop body
    373 .local	bn_mul_mont_8#
    374 .proc	bn_mul_mont_8#
    375 bn_mul_mont_8:
    376 	.prologue
    377 { .mmi;	.save		ar.pfs,prevfs
    378 	alloc		prevfs=ar.pfs,6,2,0,8
    379 	.vframe		prevsp
    380 	mov		prevsp=sp
    381 	.save		ar.lc,prevlc
    382 	mov		prevlc=ar.lc	}
    383 { .mmi;	add		r17=-6*16,sp
    384 	add		sp=-7*16,sp
    385 	.save		pr,prevpr
    386 	mov		prevpr=pr	};;
    387 
        // r16 and r17 walk the even and odd slots with a stride of 32 bytes.
    388 { .mmi;	.save.gf	0,0x10
    389 	stf.spill	[sp]=f16,-16
    390 	.save.gf	0,0x20
    391 	stf.spill	[r17]=f17,32
    392 	add		r16=-5*16,prevsp};;
    393 { .mmi;	.save.gf	0,0x40
    394 	stf.spill	[r16]=f18,32
    395 	.save.gf	0,0x80
    396 	stf.spill	[r17]=f19,32
    397 	$ADDP		aptr=0,in1	};;
    398 { .mmi;	.save.gf	0,0x100
    399 	stf.spill	[r16]=f20,32
    400 	.save.gf	0,0x200
    401 	stf.spill	[r17]=f21,32
    402 	$ADDP		r29=8,in1	};;
    403 { .mmi;	.save.gf	0,0x400
    404 	stf.spill	[r16]=f22
    405 	.save.gf	0,0x800
    406 	stf.spill	[r17]=f23
    407 	$ADDP		rptr=0,in0	};;
    408 
    410 	.body
    411 	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
    412 	.rotr		t[8]
    413 
    414 // load input vectors padding them to 8 elements
        // Words 0 and 1 are always loaded. For each k in 3..8 a predicate
        // pair compares k with num: the even predicate (k <= num) guards the
        // load of word k-1, the odd one replaces that word with the integer
        // conversion of f0. The odd predicates p5..p15 are reused after the
        // loop to stop the result store early.
        // bp words go to the rotating registers bj[7]..bj[0]; the loop always
        // reads bj[7], so presumably register rotation presents the next bp
        // word on every iteration.
    415 { .mmi;	ldf8		ai0=[aptr],16		// ap[0]
    416 	ldf8		ai1=[r29],16		// ap[1]
    417 	$ADDP		bptr=0,in2	}
    418 { .mmi;	$ADDP		r30=8,in2
    419 	$ADDP		nptr=0,in3
    420 	$ADDP		r31=8,in3	};;
    421 { .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
    422 	ldf8		bj[6]=[r30],16		// bp[1]
    423 	cmp4.le		p4,p5=3,in5	}
    424 { .mmi;	ldf8		ni0=[nptr],16		// np[0]
    425 	ldf8		ni1=[r31],16		// np[1]
    426 	cmp4.le		p6,p7=4,in5	};;
    427 
    428 { .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
    429 	(p5)fcvt.fxu	ai2=f0
    430 	cmp4.le		p8,p9=5,in5	}
    431 { .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
    432 	(p7)fcvt.fxu	ai3=f0
    433 	cmp4.le		p10,p11=6,in5	}
    434 { .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
    435 	(p5)fcvt.fxu	bj[5]=f0
    436 	cmp4.le		p12,p13=7,in5	}
    437 { .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
    438 	(p7)fcvt.fxu	bj[4]=f0
    439 	cmp4.le		p14,p15=8,in5	}
        // r28 = num-1 becomes ar.lc below. NOTE(review): addp4 is written
        // literally here instead of $ADDP; this looks deliberate, since the
        // operand is the int num rather than a pointer -- confirm.
    440 { .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
    441 	(p5)fcvt.fxu	ni2=f0
    442 	addp4		r28=-1,in5	}
    443 { .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
    444 	(p7)fcvt.fxu	ni3=f0
    445 	$ADDP		in4=0,in4	};;
    446 
    447 { .mfi;	ldf8		n0=[in4]
    448 	fcvt.fxu	tf[1]=f0
    449 	nop.i		0		}
    450 
        // The running sum t[0..7] starts at zero.
    451 { .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
    452 	(p9)fcvt.fxu	ai4=f0
    453 	mov		t[0]=r0		}
    454 { .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
    455 	(p11)fcvt.fxu	ai5=f0
    456 	mov		t[1]=r0		}
    457 { .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
    458 	(p9)fcvt.fxu	bj[3]=f0
    459 	mov		t[2]=r0		}
    460 { .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
    461 	(p11)fcvt.fxu	bj[2]=f0
    462 	mov		t[3]=r0		}
    463 { .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
    464 	(p9)fcvt.fxu	ni4=f0
    465 	mov		t[4]=r0		}
    466 { .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
    467 	(p11)fcvt.fxu	ni5=f0
    468 	mov		t[5]=r0		};;
    469 
        // Loop control: ar.lc=num-1, ar.ec=1, and only p16 set among the
        // rotating predicates.
    470 { .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
    471 	(p13)fcvt.fxu	ai6=f0
    472 	mov		t[6]=r0		}
    473 { .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
    474 	(p15)fcvt.fxu	ai7=f0
    475 	mov		t[7]=r0		}
    476 { .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
    477 	(p13)fcvt.fxu	bj[1]=f0
    478 	mov		ar.lc=r28	}
    479 { .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
    480 	(p15)fcvt.fxu	bj[0]=f0
    481 	mov		ar.ec=1		}
    482 { .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
    483 	(p13)fcvt.fxu	ni6=f0
    484 	mov		pr.rot=1<<16	}
    485 { .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
    486 	(p15)fcvt.fxu	ni7=f0
    487 	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
    488 					};;
    489 
    491 // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
    492 // to measure with help of Interval Time Counter indicated that the
    493 // factor is a tad higher: 33 or 34, if not 35. Exact measurement and
    494 // addressing the issue is problematic, because I don't have access
    495 // to platform-specific instruction-level profiler. On Itanium it
    496 // should run in 56*n ticks, because of higher xma latency...
        //
        // One iteration per bp word, fully unrolled over the eight padded
        // words; the trailing "N:" comments number the instruction groups.
        // Instructions predicated on p16 work on the current b[i] (bj[7]):
        // the ap[k]*b[i] chain, mj[0] = low word * n0, and the np[k]*m0
        // chain for k = 0..5. Instructions predicated on p17, and the
        // carry-predicated adds and compares (mutex pairs among p40-p43 and
        // p48-p51), finish the previous iteration as the "(p17)" remarks on
        // each line indicate: np[6] and np[7] products, a_k += n_k and
        // t[] += a_k.
    497 .Louter_8_ctop:
    498 	.pred.rel		"mutex",p40,p42
    499 	.pred.rel		"mutex",p48,p50
    500 { .mfi;	(p16)	nop.m		0			// 0:
    501 	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
    502 	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
    503 { .mfi;	(p42)	add		a3=a3,n3,1
    504 	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
    505 	(p16)	nop.i		0		};;
    506 { .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
    507 	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
    508 	(p50)	add		t[6]=t[6],a3,1	};;
    509 { .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
    510 	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
    511 	(p40)	cmp.ltu		p43,p41=a3,n3	}
    512 { .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
    513 	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
    514 	(p16)	nop.i		0		};;
    515 { .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
    516 	(p48)	cmp.ltu		p51,p49=t[6],a3
    517 	(p50)	cmp.leu		p51,p49=t[6],a3	};;
    518 	.pred.rel		"mutex",p41,p43
    519 	.pred.rel		"mutex",p49,p51
    520 { .mfi;	(p16)	nop.m		0			// 4:
    521 	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
    522 	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
    523 { .mfi;	(p43)	add		a4=a4,n4,1
    524 	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
    525 	(p16)	nop.i		0		};;
    526 { .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
    527 	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
    528 	(p51)	add		t[5]=t[5],a4,1	};;
    529 { .mfi;	(p16)	nop.m		0			// 6:
    530 	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
    531 	(p41)	cmp.ltu		p42,p40=a4,n4	}
    532 { .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
    533 	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
    534 	(p16)	nop.i		0		};;
    535 { .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
    536 	(p49)	cmp.ltu		p50,p48=t[5],a4
    537 	(p51)	cmp.leu		p50,p48=t[5],a4	};;
    538 	.pred.rel		"mutex",p40,p42
    539 	.pred.rel		"mutex",p48,p50
    540 { .mfi;	(p16)	nop.m		0			// 8:
    541 	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
    542 	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
    543 { .mfi;	(p42)	add		a5=a5,n5,1
    544 	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
    545 	(p16)	nop.i		0		};;
    546 { .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
    547 	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
    548 	(p50)	add		t[4]=t[4],a5,1	};;
    549 { .mfi;	(p16)	nop.m		0			// 10:
    550 	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
    551 	(p40)	cmp.ltu		p43,p41=a5,n5	}
    552 { .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
    553 	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
    554 	(p16)	nop.i		0		};;
    555 { .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
    556 	(p48)	cmp.ltu		p51,p49=t[4],a5
    557 	(p50)	cmp.leu		p51,p49=t[4],a5	};;
    558 	.pred.rel		"mutex",p41,p43
    559 	.pred.rel		"mutex",p49,p51
    560 { .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
    561 	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
    562 	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
    563 { .mfi;	(p43)	add		a6=a6,n6,1
    564 	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
    565 	(p16)	nop.i		0		};;
    566 { .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
    567 	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
    568 	(p51)	add		t[3]=t[3],a6,1	};;
    569 { .mfi;	(p16)	nop.m		0			// 14:
    570 	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
    571 	(p41)	cmp.ltu		p42,p40=a6,n6	}
    572 { .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
    573 	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
    574 	(p16)	nop.i		0		};;
    575 { .mii;	(p16)	nop.m		0			// 15:
    576 	(p49)	cmp.ltu		p50,p48=t[3],a6
    577 	(p51)	cmp.leu		p50,p48=t[3],a6	};;
    578 	.pred.rel		"mutex",p40,p42
    579 	.pred.rel		"mutex",p48,p50
    580 { .mfi;	(p16)	nop.m		0			// 16:
    581 	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
    582 	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
    583 { .mfi;	(p42)	add		a7=a7,n7,1
    584 	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
    585 	(p16)	nop.i		0		};;
    586 { .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
    587 	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
    588 	(p50)	add		t[2]=t[2],a7,1	};;
    589 { .mfi;	(p16)	nop.m		0			// 18:
    590 	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
    591 	(p40)	cmp.ltu		p43,p41=a7,n7	}
    592 { .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
    593 	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
    594 	(p16)	nop.i		0		};;
    595 { .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
    596 	(p48)	cmp.ltu		p51,p49=t[2],a7
    597 	(p50)	cmp.leu		p51,p49=t[2],a7	};;
    598 	.pred.rel		"mutex",p41,p43
    599 	.pred.rel		"mutex",p49,p51
    600 { .mfi;	(p16)	nop.m		0			// 20:
    601 	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
    602 	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
    603 { .mfi;	(p43)	add		a8=a8,n8,1
    604 	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
    605 	(p16)	nop.i		0		};;
    606 { .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
    607 	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
    608 	(p51)	add		t[1]=t[1],a8,1	};;
    609 { .mfi;	(p16)	nop.m		0			// 22:
    610 	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
    611 	(p41)	cmp.ltu		p42,p40=a8,n8	}
    612 { .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
    613 	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
    614 	(p16)	nop.i		0		};;
    615 { .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
    616 	(p49)	cmp.ltu		p50,p48=t[1],a8
    617 	(p51)	cmp.leu		p50,p48=t[1],a8	};;
    618 { .mfi;	(p16)	nop.m		0			// 24:
    619 	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
    620 	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
    621 { .mfi;	(p16)	nop.m		0
    622 	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
    623 	(p17)	mov		t[0]=r0		};;
    624 { .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
    625 	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
    626 	(p42)	add		t[0]=t[0],r0,1	};;
    627 { .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
    628 	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
    629 	(p50)	add		t[0]=t[0],r0,1	}
    630 { .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
    631 	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
    632 	(p16)	nop.i		0		};;
    633 { .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
    634 	(p16)	cmp.ltu.unc	p50,p48=t0,a1
    635 	(p16)	nop.i		0		};;
    636 	.pred.rel		"mutex",p40,p42
    637 	.pred.rel		"mutex",p48,p50
    638 { .mfi;	(p16)	nop.m		0			// 28:
    639 	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
    640 	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
    641 { .mfi;	(p42)	add		a2=a2,n2,1
    642 	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
    643 	(p16)	nop.i		0		};;
    644 { .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
    645 	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
    646 	(p50)	add		t[6]=t[6],a2,1	};;
    647 { .mfi;	(p16)	nop.m		0			// 30:
    648 	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
    649 	(p40)	cmp.ltu		p41,p39=a2,n2	}
    650 { .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
    651 	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
    652 	(p16)	nop.i		0		};;
    653 { .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
    654 	(p16)	nop.f		0
    655 	(p48)	cmp.ltu		p49,p47=t[6],a2	}
    656 { .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
    657 	(p16)	nop.f		0
    658 	br.ctop.sptk.many	.Louter_8_ctop	};;
    659 .Louter_8_cend:
    660 
    662 // above loop has to execute one more time, without (p16), which is
    663 // replaced with merged move of np[8] to GPR bank
        //
        // The (p17) and carry-predicated instructions are the same as in the
        // loop body. The slots freed by dropping the (p16) work carry
        // "(p0) getf.sig nK=niK-1", which copy np[0..7] into n1..n8 once
        // each nK has been consumed, ready for the subtraction that follows.
        // The last bundle also points r16/r17 at the first two spill slots
        // for the fills at .Ldone.
    664 	.pred.rel		"mutex",p40,p42
    665 	.pred.rel		"mutex",p48,p50
    666 { .mmi;	(p0)	getf.sig	n1=ni0			// 0:
    667 	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
    668 	(p42)	add		a3=a3,n3,1	};;
    669 { .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
    670 	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
    671 	(p50)	add		t[6]=t[6],a3,1	};;
    672 { .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
    673 	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
    674 	(p40)	cmp.ltu		p43,p41=a3,n3	}
    675 { .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
    676 	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
    677 	(p0)	nop.i		0		};;
    678 { .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
    679 	(p48)	cmp.ltu		p51,p49=t[6],a3
    680 	(p50)	cmp.leu		p51,p49=t[6],a3	};;
    681 	.pred.rel		"mutex",p41,p43
    682 	.pred.rel		"mutex",p49,p51
    683 { .mmi;	(p0)	getf.sig	n2=ni1			// 4:
    684 	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
    685 	(p43)	add		a4=a4,n4,1	};;
    686 { .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
    687 	(p0)	nop.f		0
    688 	(p51)	add		t[5]=t[5],a4,1	};;
    689 { .mfi;	(p0)	getf.sig	n3=ni2			// 6:
    690 	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
    691 	(p41)	cmp.ltu		p42,p40=a4,n4	}
    692 { .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
    693 	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
    694 	(p0)	nop.i		0		};;
    695 { .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
    696 	(p49)	cmp.ltu		p50,p48=t[5],a4
    697 	(p51)	cmp.leu		p50,p48=t[5],a4	};;
    698 	.pred.rel		"mutex",p40,p42
    699 	.pred.rel		"mutex",p48,p50
    700 { .mii;	(p0)	getf.sig	n4=ni3			// 8:
    701 	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
    702 	(p42)	add		a5=a5,n5,1	};;
    703 { .mii;	(p0)	nop.m		0			// 9:
    704 	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
    705 	(p50)	add		t[4]=t[4],a5,1	};;
    706 { .mii;	(p0)	nop.m		0			// 10:
    707 	(p40)	cmp.ltu		p43,p41=a5,n5
    708 	(p42)	cmp.leu		p43,p41=a5,n5	};;
    709 { .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
    710 	(p48)	cmp.ltu		p51,p49=t[4],a5
    711 	(p50)	cmp.leu		p51,p49=t[4],a5	};;
    712 	.pred.rel		"mutex",p41,p43
    713 	.pred.rel		"mutex",p49,p51
    714 { .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
    715 	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
    716 	(p43)	add		a6=a6,n6,1	};;
    717 { .mii;	(p0)	getf.sig	n5=ni4			// 13:
    718 	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
    719 	(p51)	add		t[3]=t[3],a6,1	};;
    720 { .mii;	(p0)	nop.m		0			// 14:
    721 	(p41)	cmp.ltu		p42,p40=a6,n6
    722 	(p43)	cmp.leu		p42,p40=a6,n6	};;
    723 { .mii;	(p0)	getf.sig	n6=ni5			// 15:
    724 	(p49)	cmp.ltu		p50,p48=t[3],a6
    725 	(p51)	cmp.leu		p50,p48=t[3],a6	};;
    726 	.pred.rel		"mutex",p40,p42
    727 	.pred.rel		"mutex",p48,p50
    728 { .mii;	(p0)	nop.m		0			// 16:
    729 	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
    730 	(p42)	add		a7=a7,n7,1	};;
    731 { .mii;	(p0)	nop.m		0			// 17:
    732 	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
    733 	(p50)	add		t[2]=t[2],a7,1	};;
    734 { .mii;	(p0)	nop.m		0			// 18:
    735 	(p40)	cmp.ltu		p43,p41=a7,n7
    736 	(p42)	cmp.leu		p43,p41=a7,n7	};;
    737 { .mii;	(p0)	getf.sig	n7=ni6			// 19:
    738 	(p48)	cmp.ltu		p51,p49=t[2],a7
    739 	(p50)	cmp.leu		p51,p49=t[2],a7	};;
    740 	.pred.rel		"mutex",p41,p43
    741 	.pred.rel		"mutex",p49,p51
    742 { .mii;	(p0)	nop.m		0			// 20:
    743 	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
    744 	(p43)	add		a8=a8,n8,1	};;
    745 { .mmi;	(p0)	nop.m		0			// 21:
    746 	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
    747 	(p51)	add		t[1]=t[1],a8,1	}
    748 { .mmi;	(p17)	mov		t[0]=r0
    749 	(p41)	cmp.ltu		p42,p40=a8,n8
    750 	(p43)	cmp.leu		p42,p40=a8,n8	};;
    751 { .mmi;	(p0)	getf.sig	n8=ni7			// 22:
    752 	(p49)	cmp.ltu		p50,p48=t[1],a8
    753 	(p51)	cmp.leu		p50,p48=t[1],a8	}
    754 { .mmi;	(p42)	add		t[0]=t[0],r0,1
    755 	(p0)	add		r16=-7*16,prevsp
    756 	(p0)	add		r17=-6*16,prevsp	};;
    757 
    759 // subtract np[8] from carrybit|tmp[8]
    760 // carrybit|tmp[8] layout upon exit from above loop is:
    761 //	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
        //
        // Borrow-propagating chain, least significant word first: each
        // difference overwrites its nK register (the top word goes to a8).
        // The borrow alternates between the predicate pairs p32/p34 and
        // p33/p35; in each pair the higher-numbered predicate means "borrow".
        // After the last compare p34 is the borrow out of the carry word and
        // p32 its complement. r18 is pointed at a spill slot for .Ldone.
    762 { .mmi;	(p50)add	t[0]=t[0],r0,1
    763 	add		r18=-5*16,prevsp
    764 	sub		n1=t0,n1	};;
    765 { .mmi;	cmp.gtu		p34,p32=n1,t0;;
    766 	.pred.rel	"mutex",p32,p34
    767 	(p32)sub	n2=t[7],n2
    768 	(p34)sub	n2=t[7],n2,1	};;
    769 { .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
    770 	(p34)cmp.geu	p35,p33=n2,t[7];;
    771 	.pred.rel	"mutex",p33,p35
    772 	(p33)sub	n3=t[6],n3	}
    773 { .mmi;	(p35)sub	n3=t[6],n3,1;;
    774 	(p33)cmp.gtu	p34,p32=n3,t[6]
    775 	(p35)cmp.geu	p34,p32=n3,t[6]	};;
    776 	.pred.rel	"mutex",p32,p34
    777 { .mii;	(p32)sub	n4=t[5],n4
    778 	(p34)sub	n4=t[5],n4,1;;
    779 	(p32)cmp.gtu	p35,p33=n4,t[5]	}
    780 { .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
    781 	.pred.rel	"mutex",p33,p35
    782 	(p33)sub	n5=t[4],n5
    783 	(p35)sub	n5=t[4],n5,1	};;
    784 { .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
    785 	(p35)cmp.geu	p34,p32=n5,t[4];;
    786 	.pred.rel	"mutex",p32,p34
    787 	(p32)sub	n6=t[3],n6	}
    788 { .mmi;	(p34)sub	n6=t[3],n6,1;;
    789 	(p32)cmp.gtu	p35,p33=n6,t[3]
    790 	(p34)cmp.geu	p35,p33=n6,t[3]	};;
    791 	.pred.rel	"mutex",p33,p35
    792 { .mii;	(p33)sub	n7=t[2],n7
    793 	(p35)sub	n7=t[2],n7,1;;
    794 	(p33)cmp.gtu	p34,p32=n7,t[2]	}
    795 { .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
    796 	.pred.rel	"mutex",p32,p34
    797 	(p32)sub	n8=t[1],n8
    798 	(p34)sub	n8=t[1],n8,1	};;
    799 { .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
    800 	(p34)cmp.geu	p35,p33=n8,t[1];;
    801 	.pred.rel	"mutex",p33,p35
    802 	(p33)sub	a8=t[0],r0	}
    803 { .mmi;	(p35)sub	a8=t[0],r0,1;;
    804 	(p33)cmp.gtu	p34,p32=a8,t[0]
    805 	(p35)cmp.geu	p34,p32=a8,t[0]	};;
    806 
    808 // save the result, either tmp[num] or tmp[num]-np[num]
        // p32 (no final borrow) stores the differences n1..n8; p34 (final
        // borrow) stores the unsubtracted words t0,t[7]..t[1]. The odd
        // predicates p5..p15 computed from num while loading the operands
        // leave for .Ldone once num words have been written. r19 is pointed
        // at the fourth spill slot for the fills there.
    809 	.pred.rel	"mutex",p32,p34
    810 { .mmi;	(p32)st8	[rptr]=n1,8
    811 	(p34)st8	[rptr]=t0,8
    812 	add		r19=-4*16,prevsp};;
    813 { .mmb;	(p32)st8	[rptr]=n2,8
    814 	(p34)st8	[rptr]=t[7],8
    815 	(p5)br.cond.dpnt.few	.Ldone	};;
    816 { .mmb;	(p32)st8	[rptr]=n3,8
    817 	(p34)st8	[rptr]=t[6],8
    818 	(p7)br.cond.dpnt.few	.Ldone	};;
    819 { .mmb;	(p32)st8	[rptr]=n4,8
    820 	(p34)st8	[rptr]=t[5],8
    821 	(p9)br.cond.dpnt.few	.Ldone	};;
    822 { .mmb;	(p32)st8	[rptr]=n5,8
    823 	(p34)st8	[rptr]=t[4],8
    824 	(p11)br.cond.dpnt.few	.Ldone	};;
    825 { .mmb;	(p32)st8	[rptr]=n6,8
    826 	(p34)st8	[rptr]=t[3],8
    827 	(p13)br.cond.dpnt.few	.Ldone	};;
    828 { .mmb;	(p32)st8	[rptr]=n7,8
    829 	(p34)st8	[rptr]=t[2],8
    830 	(p15)br.cond.dpnt.few	.Ldone	};;
    831 { .mmb;	(p32)st8	[rptr]=n8,8
    832 	(p34)st8	[rptr]=t[1],8
    833 	nop.b		0		};;
    834 .Ldone:						// epilogue
        // Refill f16..f23 from the eight spill slots (r16..r19 start at
        // prevsp-7*16 .. prevsp-4*16 and each advances by 64 bytes once),
        // restore pr, ar.lc and sp, and return 1.
    835 { .mmi;	ldf.fill	f16=[r16],64
    836 	ldf.fill	f17=[r17],64
    837 	nop.i		0		}
    838 { .mmi;	ldf.fill	f18=[r18],64
    839 	ldf.fill	f19=[r19],64
    840 	mov		pr=prevpr,0x1ffff	};;
    841 { .mmi;	ldf.fill	f20=[r16]
    842 	ldf.fill	f21=[r17]
    843 	mov		ar.lc=prevlc	}
    844 { .mmi;	ldf.fill	f22=[r18]
    845 	ldf.fill	f23=[r19]
    846 	mov		ret0=1		}	// signal "handled"
    847 { .mib;	rum		1<<5
    848 	.restore	sp
    849 	mov		sp=prevsp
    850 	br.ret.sptk.many	b0	};;
    851 .endp	bn_mul_mont_8#
    852 
    853 .type	copyright#,\@object
        // Attribution string, emitted as a NUL-terminated data object. The
        // backslash before each at-sign is a Perl escape that keeps the
        // here-document from interpolating an array.
        // NOTE(review): no section directive precedes this, so as far as
        // this file shows the string lands in the current (.text) section.
    854 copyright:
    855 stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
    856 ___
    857 
    858 $output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
        # With a file name argument STDOUT is redirected to that file;
        # without one the assembly goes to the existing STDOUT.  Both the
        # open and the close are checked, so an unwritable path or a failed
        # flush (e.g. a full disk) aborts with a non-zero status instead of
        # silently leaving a missing or truncated assembly file.
    859 print $code;
    860 close STDOUT or die "error closing STDOUT: $!";
    861