Home | History | Annotate | Download | only in asm
      1 .explicit
      2 .text
      3 .ident	"ia64.S, Version 2.1"
      4 .ident	"IA-64 ISA artwork by Andy Polyakov <appro (at) fy.chalmers.se>"
      5 
      6 //
      7 // ====================================================================
      8 // Written by Andy Polyakov <appro (at) fy.chalmers.se> for the OpenSSL
      9 // project.
     10 //
     11 // Rights for redistribution and usage in source and binary forms are
     12 // granted according to the OpenSSL license. Warranty of any kind is
     13 // disclaimed.
     14 // ====================================================================
     15 //
     16 // Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
     17 // different from Itanium to this module viewpoint. Most notably, is it
     18 // "wider" than Itanium? Can you experience loop scalability as
     19 // discussed in commentary sections? Not really:-( Itanium2 has 6
     20 // integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
     21 // spin twice as fast, as I need 8 IALU ports. Amount of floating point
     22 // ports is the same, i.e. 2, while I need 4. In other words, to this
     23 // module Itanium2 remains effectively as "wide" as Itanium. Yet it's
     24 // essentially different in respect to this module, and a re-tune was
     25 // required. Well, because some intruction latencies has changed. Most
     26 // noticeably those intensively used:
     27 //
     28 //			Itanium	Itanium2
     29 //	ldf8		9	6		L2 hit
     30 //	ld8		2	1		L1 hit
     31 //	getf		2	5
     32 //	xma[->getf]	7[+1]	4[+0]
     33 //	add[->st8]	1[+1]	1[+0]
     34 //
     35 // What does it mean? You might ratiocinate that the original code
     36 // should run just faster... Because sum of latencies is smaller...
     37 // Wrong! Note that getf latency increased. This means that if a loop is
     38 // scheduled for lower latency (as they were), then it will suffer from
     39 // stall condition and the code will therefore turn anti-scalable, e.g.
     40 // original bn_mul_words spun at 5*n or 2.5 times slower than expected
     41 // on Itanium2! What to do? Reschedule loops for Itanium2? But then
     42 // Itanium would exhibit anti-scalability. So I've chosen to reschedule
     43 // for worst latency for every instruction aiming for best *all-round*
     44 // performance.
     45 
     46 // Q.	How much faster does it get?
     47 // A.	Here is the output from 'openssl speed rsa dsa' for vanilla
     48 //	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
     49 //	Linux 7.1 2.96-81):
     50 //
     51 //	                  sign    verify    sign/s verify/s
     52 //	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
     53 //	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
     54 //	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
     55 //	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
     56 //	                  sign    verify    sign/s verify/s
     57 //	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
     58 //	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
     59 //
     60 //	And here is similar output but for this assembler
     61 //	implementation:-)
     62 //
     63 //	                  sign    verify    sign/s verify/s
     64 //	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
     65 //	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
     66 //	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
     67 //	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
     68 //	                  sign    verify    sign/s verify/s
     69 //	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
     70 //	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
     71 //
     72 //	Yes, you may argue that it's not fair comparison as it's
     73 //	possible to craft the C implementation with BN_UMULT_HIGH
     74 //	inline assembler macro. But of course! Here is the output
     75 //	with the macro:
     76 //
     77 //	                  sign    verify    sign/s verify/s
     78 //	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
     79 //	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
     80 //	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
     81 //	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
     82 //	                  sign    verify    sign/s verify/s
     83 //	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
     84 //	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
     85 //
     86 //	My code is still way faster, huh:-) And I believe that even
     87 //	higher performance can be achieved. Note that as keys get
     88 //	longer, performance gain is larger. Why? According to the
     89 //	profiler there is another player in the field, namely
     90 //	BN_from_montgomery consuming larger and larger portion of CPU
     91 //	time as keysize decreases. I therefore consider putting effort
     92 //	to assembler implementation of the following routine:
     93 //
     94 //	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
     95 //	{
     96 //	int      i,j;
     97 //	BN_ULONG v;
     98 //
     99 //	for (i=0; i<nl; i++)
    100 //		{
    101 //		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
    102 //		nrp++;
    103 //		rp++;
    104 //		if (((nrp[-1]+=v)&BN_MASK2) < v)
    105 //			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
    106 //		}
    107 //	}
    108 //
    109 //	It might as well be beneficial to implement even combaX
    110 //	variants, as it appears as it can literally unleash the
    111 //	performance (see comment section to bn_mul_comba8 below).
    112 //
    113 //	And finally for your reference the output for 0.9.6a compiled
    114 //	with SGIcc version 0.01.0-12 (keep in mind that for the moment
    115 //	of this writing it's not possible to convince SGIcc to use
    116 //	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
    117 //	i.e. for a compiler generated one:-):
    118 //
    119 //	                  sign    verify    sign/s verify/s
    120 //	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
    121 //	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
    122 //	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
    123 //	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
    124 //	                  sign    verify    sign/s verify/s
    125 //	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
    126 //	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
    127 //
    128 //	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
    129 //	system running Redhat Linux 7.1 (very special thanks to Ray
    130 //	McCaffity of Williams Communications for providing an account).
    131 //
    132 // Q.	What's the heck with 'rum 1<<5' at the end of every function?
    133 // A.	Well, by clearing the "upper FP registers written" bit of the
    134 //	User Mask I want to excuse the kernel from preserving upper
    135 //	(f32-f128) FP register bank over process context switch, thus
    136 //	minimizing bus bandwidth consumption during the switch (i.e.
    137 //	after PKI opration completes and the program is off doing
    138 //	something else like bulk symmetric encryption). Having said
    139 //	this, I also want to point out that it might be good idea
    140 //	to compile the whole toolkit (as well as majority of the
    141 //	programs for that matter) with -mfixed-range=f32-f127 command
    142 //	line option. No, it doesn't prevent the compiler from writing
    143 //	to upper bank, but at least discourages to do so. If you don't
    144 //	like the idea you have the option to compile the module with
    145 //	-Drum=nop.m in command line.
    146 //
    147 
    148 #if defined(_HPUX_SOURCE) && !defined(_LP64)
    149 #define	ADDP	addp4
    150 #else
    151 #define	ADDP	add
    152 #endif
    153 
    154 #if 1
    155 //
    156 // bn_[add|sub]_words routines.
    157 //
    158 // Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
    159 // data reside in L1 cache, i.e. 2 ticks away). It's possible to
    160 // compress the epilogue and get down to 2*n+6, but at the cost of
    161 // scalability (the neat feature of this implementation is that it
    162 // shall automagically spin in n+5 on "wider" IA-64 implementations:-)
    163 // I consider that the epilogue is short enough as it is to trade tiny
    164 // performance loss on Itanium for scalability.
    165 //
    166 // BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
    167 //
    168 .global	bn_add_words#
    169 .proc	bn_add_words#
    170 .align	64
    171 .skip	32	// makes the loop body aligned at 64-byte boundary
    172 bn_add_words:
    173 	.prologue
    174 	.save	ar.pfs,r2
    175 { .mii;	alloc		r2=ar.pfs,4,12,0,16
    176 	cmp4.le		p6,p0=r35,r0	};;
    177 { .mfb;	mov		r8=r0			// return value
    178 (p6)	br.ret.spnt.many	b0	};;
    179 
    180 { .mib;	sub		r10=r35,r0,1
    181 	.save	ar.lc,r3
    182 	mov		r3=ar.lc
    183 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
    184 					}
    185 { .mib;	ADDP		r14=0,r32		// rp
    186 	.save	pr,r9
    187 	mov		r9=pr		};;
    188 	.body
    189 { .mii;	ADDP		r15=0,r33		// ap
    190 	mov		ar.lc=r10
    191 	mov		ar.ec=6		}
    192 { .mib;	ADDP		r16=0,r34		// bp
    193 	mov		pr.rot=1<<16	};;
    194 
    195 .L_bn_add_words_ctop:
    196 { .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
    197 	(p18)	add		r39=r37,r34
    198 	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
    199 { .mfb;	(p0)	nop.m		0x0
    200 	(p0)	nop.f		0x0
    201 	(p0)	nop.b		0x0		}
    202 { .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
    203 	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
    204 	(p58)	add		r41=1,r41	} // (p20)
    205 { .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
    206 	(p0)	nop.f		0x0
    207 	br.ctop.sptk	.L_bn_add_words_ctop	};;
    208 .L_bn_add_words_cend:
    209 
    210 { .mii;
    211 (p59)	add		r8=1,r8		// return value
    212 	mov		pr=r9,0x1ffff
    213 	mov		ar.lc=r3	}
    214 { .mbb;	nop.b		0x0
    215 	br.ret.sptk.many	b0	};;
    216 .endp	bn_add_words#
    217 
    218 //
    219 // BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
    220 //
    221 .global	bn_sub_words#
    222 .proc	bn_sub_words#
    223 .align	64
    224 .skip	32	// makes the loop body aligned at 64-byte boundary
    225 bn_sub_words:
    226 	.prologue
    227 	.save	ar.pfs,r2
    228 { .mii;	alloc		r2=ar.pfs,4,12,0,16
    229 	cmp4.le		p6,p0=r35,r0	};;
    230 { .mfb;	mov		r8=r0			// return value
    231 (p6)	br.ret.spnt.many	b0	};;
    232 
    233 { .mib;	sub		r10=r35,r0,1
    234 	.save	ar.lc,r3
    235 	mov		r3=ar.lc
    236 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
    237 					}
    238 { .mib;	ADDP		r14=0,r32		// rp
    239 	.save	pr,r9
    240 	mov		r9=pr		};;
    241 	.body
    242 { .mii;	ADDP		r15=0,r33		// ap
    243 	mov		ar.lc=r10
    244 	mov		ar.ec=6		}
    245 { .mib;	ADDP		r16=0,r34		// bp
    246 	mov		pr.rot=1<<16	};;
    247 
    248 .L_bn_sub_words_ctop:
    249 { .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
    250 	(p18)	sub		r39=r37,r34
    251 	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
    252 { .mfb;	(p0)	nop.m		0x0
    253 	(p0)	nop.f		0x0
    254 	(p0)	nop.b		0x0		}
    255 { .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
    256 	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
    257 	(p58)	add		r41=-1,r41	} // (p20)
    258 { .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
    259 	(p0)	nop.b		0x0
    260 	br.ctop.sptk	.L_bn_sub_words_ctop	};;
    261 .L_bn_sub_words_cend:
    262 
    263 { .mii;
    264 (p59)	add		r8=1,r8		// return value
    265 	mov		pr=r9,0x1ffff
    266 	mov		ar.lc=r3	}
    267 { .mbb;	nop.b		0x0
    268 	br.ret.sptk.many	b0	};;
    269 .endp	bn_sub_words#
    270 #endif
    271 
    272 #if 0
    273 #define XMA_TEMPTATION
    274 #endif
    275 
    276 #if 1
    277 //
    278 // BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
    279 //
    280 .global	bn_mul_words#
    281 .proc	bn_mul_words#
    282 .align	64
    283 .skip	32	// makes the loop body aligned at 64-byte boundary
    284 bn_mul_words:
    285 	.prologue
    286 	.save	ar.pfs,r2
    287 #ifdef XMA_TEMPTATION
    288 { .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
    289 #else
    290 { .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
    291 #endif
    292 { .mib;	mov		r8=r0			// return value
    293 	cmp4.le		p6,p0=r34,r0
    294 (p6)	br.ret.spnt.many	b0		};;
    295 
    296 { .mii;	sub	r10=r34,r0,1
    297 	.save	ar.lc,r3
    298 	mov	r3=ar.lc
    299 	.save	pr,r9
    300 	mov	r9=pr			};;
    301 
    302 	.body
    303 { .mib;	setf.sig	f8=r35	// w
    304 	mov		pr.rot=0x800001<<16
    305 			// ------^----- serves as (p50) at first (p27)
    306 	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
    307 					}
    308 
    309 #ifndef XMA_TEMPTATION
    310 
    311 { .mmi;	ADDP		r14=0,r32	// rp
    312 	ADDP		r15=0,r33	// ap
    313 	mov		ar.lc=r10	}
    314 { .mmi;	mov		r40=0		// serves as r35 at first (p27)
    315 	mov		ar.ec=13	};;
    316 
    317 // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
    318 // L2 cache (i.e. 9 ticks away) as floating point load/store instructions
    319 // bypass L1 cache and L2 latency is actually best-case scenario for
    320 // ldf8. The loop is not scalable and shall run in 2*(n+12) even on
    321 // "wider" IA-64 implementations. It's a trade-off here. n+24 loop
    322 // would give us ~5% in *overall* performance improvement on "wider"
    323 // IA-64, but would hurt Itanium for about same because of longer
    324 // epilogue. As it's a matter of few percents in either case I've
    325 // chosen to trade the scalability for development time (you can see
    326 // this very instruction sequence in bn_mul_add_words loop which in
    327 // turn is scalable).
    328 .L_bn_mul_words_ctop:
    329 { .mfi;	(p25)	getf.sig	r36=f52			// low
    330 	(p21)	xmpy.lu		f48=f37,f8
    331 	(p28)	cmp.ltu		p54,p50=r41,r39	}
    332 { .mfi;	(p16)	ldf8		f32=[r15],8
    333 	(p21)	xmpy.hu		f40=f37,f8
    334 	(p0)	nop.i		0x0		};;
    335 { .mii;	(p25)	getf.sig	r32=f44			// high
    336 	.pred.rel	"mutex",p50,p54
    337 	(p50)	add		r40=r38,r35		// (p27)
    338 	(p54)	add		r40=r38,r35,1	}	// (p27)
    339 { .mfb;	(p28)	st8		[r14]=r41,8
    340 	(p0)	nop.f		0x0
    341 	br.ctop.sptk	.L_bn_mul_words_ctop	};;
    342 .L_bn_mul_words_cend:
    343 
    344 { .mii;	nop.m		0x0
    345 .pred.rel	"mutex",p51,p55
    346 (p51)	add		r8=r36,r0
    347 (p55)	add		r8=r36,r0,1	}
    348 { .mfb;	nop.m	0x0
    349 	nop.f	0x0
    350 	nop.b	0x0			}
    351 
    352 #else	// XMA_TEMPTATION
    353 
    354 	setf.sig	f37=r0	// serves as carry at (p18) tick
    355 	mov		ar.lc=r10
    356 	mov		ar.ec=5;;
    357 
    358 // Most of you examining this code very likely wonder why in the name
    359 // of Intel the following loop is commented out? Indeed, it looks so
    360 // neat that you find it hard to believe that it's something wrong
    361 // with it, right? The catch is that every iteration depends on the
    362 // result from previous one and the latter isn't available instantly.
    363 // The loop therefore spins at the latency of xma minus 1, or in other
    364 // words at 6*(n+4) ticks:-( Compare to the "production" loop above
    365 // that runs in 2*(n+11) where the low latency problem is worked around
    366 // by moving the dependency to one-tick latent interger ALU. Note that
    367 // "distance" between ldf8 and xma is not latency of ldf8, but the
    368 // *difference* between xma and ldf8 latencies.
    369 .L_bn_mul_words_ctop:
    370 { .mfi;	(p16)	ldf8		f32=[r33],8
    371 	(p18)	xma.hu		f38=f34,f8,f39	}
    372 { .mfb;	(p20)	stf8		[r32]=f37,8
    373 	(p18)	xma.lu		f35=f34,f8,f39
    374 	br.ctop.sptk	.L_bn_mul_words_ctop	};;
    375 .L_bn_mul_words_cend:
    376 
    377 	getf.sig	r8=f41		// the return value
    378 
    379 #endif	// XMA_TEMPTATION
    380 
    381 { .mii;	nop.m		0x0
    382 	mov		pr=r9,0x1ffff
    383 	mov		ar.lc=r3	}
    384 { .mfb;	rum		1<<5		// clear um.mfh
    385 	nop.f		0x0
    386 	br.ret.sptk.many	b0	};;
    387 .endp	bn_mul_words#
    388 #endif
    389 
    390 #if 1
    391 //
    392 // BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
    393 //
    394 .global	bn_mul_add_words#
    395 .proc	bn_mul_add_words#
    396 .align	64
    397 .skip	48	// makes the loop body aligned at 64-byte boundary
    398 bn_mul_add_words:
    399 	.prologue
    400 	.save	ar.pfs,r2
    401 { .mmi;	alloc		r2=ar.pfs,4,4,0,8
    402 	cmp4.le		p6,p0=r34,r0
    403 	.save	ar.lc,r3
    404 	mov		r3=ar.lc	};;
    405 { .mib;	mov		r8=r0		// return value
    406 	sub		r10=r34,r0,1
    407 (p6)	br.ret.spnt.many	b0	};;
    408 
    409 { .mib;	setf.sig	f8=r35		// w
    410 	.save	pr,r9
    411 	mov		r9=pr
    412 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
    413 					}
    414 	.body
    415 { .mmi;	ADDP		r14=0,r32	// rp
    416 	ADDP		r15=0,r33	// ap
    417 	mov		ar.lc=r10	}
    418 { .mii;	ADDP		r16=0,r32	// rp copy
    419 	mov		pr.rot=0x2001<<16
    420 			// ------^----- serves as (p40) at first (p27)
    421 	mov		ar.ec=11	};;
    422 
    423 // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
    424 // Itanium 2. Yes, unlike previous versions it scales:-) Previous
    425 // version was peforming *all* additions in IALU and was starving
    426 // for those even on Itanium 2. In this version one addition is
    427 // moved to FPU and is folded with multiplication. This is at cost
    428 // of propogating the result from previous call to this subroutine
    429 // to L2 cache... In other words negligible even for shorter keys.
    430 // *Overall* performance improvement [over previous version] varies
    431 // from 11 to 22 percent depending on key length.
    432 .L_bn_mul_add_words_ctop:
    433 .pred.rel	"mutex",p40,p42
    434 { .mfi;	(p23)	getf.sig	r36=f45			// low
    435 	(p20)	xma.lu		f42=f36,f8,f50		// low
    436 	(p40)	add		r39=r39,r35	}	// (p27)
    437 { .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
    438 	(p20)	xma.hu		f36=f36,f8,f50		// high
    439 	(p42)	add		r39=r39,r35,1	};;	// (p27)
    440 { .mmi;	(p24)	getf.sig	r32=f40			// high
    441 	(p16)	ldf8		f46=[r16],8		// *(rp1++)
    442 	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
    443 { .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
    444 	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
    445 	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
    446 .L_bn_mul_add_words_cend:
    447 
    448 { .mmi;	.pred.rel	"mutex",p40,p42
    449 (p40)	add		r8=r35,r0
    450 (p42)	add		r8=r35,r0,1
    451 	mov		pr=r9,0x1ffff	}
    452 { .mib;	rum		1<<5		// clear um.mfh
    453 	mov		ar.lc=r3
    454 	br.ret.sptk.many	b0	};;
    455 .endp	bn_mul_add_words#
    456 #endif
    457 
    458 #if 1
    459 //
    460 // void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
    461 //
    462 .global	bn_sqr_words#
    463 .proc	bn_sqr_words#
    464 .align	64
    465 .skip	32	// makes the loop body aligned at 64-byte boundary
    466 bn_sqr_words:
    467 	.prologue
    468 	.save	ar.pfs,r2
    469 { .mii;	alloc		r2=ar.pfs,3,0,0,0
    470 	sxt4		r34=r34		};;
    471 { .mii;	cmp.le		p6,p0=r34,r0
    472 	mov		r8=r0		}	// return value
    473 { .mfb;	ADDP		r32=0,r32
    474 	nop.f		0x0
    475 (p6)	br.ret.spnt.many	b0	};;
    476 
    477 { .mii;	sub	r10=r34,r0,1
    478 	.save	ar.lc,r3
    479 	mov	r3=ar.lc
    480 	.save	pr,r9
    481 	mov	r9=pr			};;
    482 
    483 	.body
    484 { .mib;	ADDP		r33=0,r33
    485 	mov		pr.rot=1<<16
    486 	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
    487 					}
    488 { .mii;	add		r34=8,r32
    489 	mov		ar.lc=r10
    490 	mov		ar.ec=18	};;
    491 
    492 // 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
    493 // possible to compress the epilogue (I'm getting tired to write this
    494 // comment over and over) and get down to 2*n+16 at the cost of
    495 // scalability. The decision will very likely be reconsidered after the
    496 // benchmark program is profiled. I.e. if perfomance gain on Itanium
    497 // will appear larger than loss on "wider" IA-64, then the loop should
    498 // be explicitely split and the epilogue compressed.
    499 .L_bn_sqr_words_ctop:
    500 { .mfi;	(p16)	ldf8		f32=[r33],8
    501 	(p25)	xmpy.lu		f42=f41,f41
    502 	(p0)	nop.i		0x0		}
    503 { .mib;	(p33)	stf8		[r32]=f50,16
    504 	(p0)	nop.i		0x0
    505 	(p0)	nop.b		0x0		}
    506 { .mfi;	(p0)	nop.m		0x0
    507 	(p25)	xmpy.hu		f52=f41,f41
    508 	(p0)	nop.i		0x0		}
    509 { .mib;	(p33)	stf8		[r34]=f60,16
    510 	(p0)	nop.i		0x0
    511 	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
    512 .L_bn_sqr_words_cend:
    513 
    514 { .mii;	nop.m		0x0
    515 	mov		pr=r9,0x1ffff
    516 	mov		ar.lc=r3	}
    517 { .mfb;	rum		1<<5		// clear um.mfh
    518 	nop.f		0x0
    519 	br.ret.sptk.many	b0	};;
    520 .endp	bn_sqr_words#
    521 #endif
    522 
    523 #if 1
    524 // Apparently we win nothing by implementing special bn_sqr_comba8.
    525 // Yes, it is possible to reduce the number of multiplications by
    526 // almost factor of two, but then the amount of additions would
    527 // increase by factor of two (as we would have to perform those
    528 // otherwise performed by xma ourselves). Normally we would trade
    529 // anyway as multiplications are way more expensive, but not this
    530 // time... Multiplication kernel is fully pipelined and as we drain
    531 // one 128-bit multiplication result per clock cycle multiplications
    532 // are effectively as inexpensive as additions. Special implementation
    533 // might become of interest for "wider" IA-64 implementation as you'll
    534 // be able to get through the multiplication phase faster (there won't
    535 // be any stall issues as discussed in the commentary section below and
    536 // you therefore will be able to employ all 4 FP units)... But these
    537 // Itanium days it's simply too hard to justify the effort so I just
    538 // drop down to bn_mul_comba8 code:-)
    539 //
    540 // void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
    541 //
    542 .global	bn_sqr_comba8#
    543 .proc	bn_sqr_comba8#
    544 .align	64
    545 bn_sqr_comba8:
    546 	.prologue
    547 	.save	ar.pfs,r2
    548 #if defined(_HPUX_SOURCE) && !defined(_LP64)
    549 { .mii;	alloc	r2=ar.pfs,2,1,0,0
    550 	addp4	r33=0,r33
    551 	addp4	r32=0,r32		};;
    552 { .mii;
    553 #else
    554 { .mii;	alloc	r2=ar.pfs,2,1,0,0
    555 #endif
    556 	mov	r34=r33
    557 	add	r14=8,r33		};;
    558 	.body
    559 { .mii;	add	r17=8,r34
    560 	add	r15=16,r33
    561 	add	r18=16,r34		}
    562 { .mfb;	add	r16=24,r33
    563 	br	.L_cheat_entry_point8	};;
    564 .endp	bn_sqr_comba8#
    565 #endif
    566 
    567 #if 1
    568 // I've estimated this routine to run in ~120 ticks, but in reality
    569 // (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
    570 // cycles consumed for instructions fetch? Or did I misinterpret some
    571 // clause in Itanium -architecture manual? Comments are welcomed and
    572 // highly appreciated.
    573 //
    574 // On Itanium 2 it takes ~190 ticks. This is because of stalls on
    575 // result from getf.sig. I do nothing about it at this point for
    576 // reasons depicted below.
    577 //
    578 // However! It should be noted that even 160 ticks is darn good result
    579 // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
    580 // C version (compiled with gcc with inline assembler). I really
    581 // kicked compiler's butt here, didn't I? Yeah! This brings us to the
    582 // following statement. It's damn shame that this routine isn't called
    583 // very often nowadays! According to the profiler most CPU time is
    584 // consumed by bn_mul_add_words called from BN_from_montgomery. In
    585 // order to estimate what we're missing, I've compared the performance
    586 // of this routine against "traditional" implementation, i.e. against
    587 // following routine:
    588 //
    589 // void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    590 // {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
    591 //	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
    592 //	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
    593 //	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
    594 //	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
    595 //	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
    596 //	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
    597 //	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
    598 // }
    599 //
    600 // The one below is over 8 times faster than the one above:-( Even
    601 // more reasons to "combafy" bn_mul_add_mont...
    602 //
    603 // And yes, this routine really made me wish there were an optimizing
    604 // assembler! It also feels like it deserves a dedication.
    605 //
    606 //	To my wife for being there and to my kids...
    607 //
    608 // void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    609 //
    610 #define	carry1	r14
    611 #define	carry2	r15
    612 #define	carry3	r34
    613 .global	bn_mul_comba8#
    614 .proc	bn_mul_comba8#
    615 .align	64
    616 bn_mul_comba8:
    617 	.prologue
    618 	.save	ar.pfs,r2
    619 #if defined(_HPUX_SOURCE) && !defined(_LP64)
    620 { .mii;	alloc	r2=ar.pfs,3,0,0,0
    621 	addp4	r33=0,r33
    622 	addp4	r34=0,r34		};;
    623 { .mii;	addp4	r32=0,r32
    624 #else
    625 { .mii;	alloc   r2=ar.pfs,3,0,0,0
    626 #endif
    627 	add	r14=8,r33
    628 	add	r17=8,r34		}
    629 	.body
    630 { .mii;	add	r15=16,r33
    631 	add	r18=16,r34
    632 	add	r16=24,r33		}
    633 .L_cheat_entry_point8:
    634 { .mmi;	add	r19=24,r34
    635 
    636 	ldf8	f32=[r33],32		};;
    637 
    638 { .mmi;	ldf8	f120=[r34],32
    639 	ldf8	f121=[r17],32		}
    640 { .mmi;	ldf8	f122=[r18],32
    641 	ldf8	f123=[r19],32		};;
    642 { .mmi;	ldf8	f124=[r34]
    643 	ldf8	f125=[r17]		}
    644 { .mmi;	ldf8	f126=[r18]
    645 	ldf8	f127=[r19]		}
    646 
    647 { .mmi;	ldf8	f33=[r14],32
    648 	ldf8	f34=[r15],32		}
    649 { .mmi;	ldf8	f35=[r16],32;;
    650 	ldf8	f36=[r33]		}
    651 { .mmi;	ldf8	f37=[r14]
    652 	ldf8	f38=[r15]		}
    653 { .mfi;	ldf8	f39=[r16]
    654 // -------\ Entering multiplier's heaven /-------
    655 // ------------\                    /------------
    656 // -----------------\          /-----------------
    657 // ----------------------\/----------------------
    658 		xma.hu	f41=f32,f120,f0		}
    659 { .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
    660 { .mfi;		xma.hu	f51=f32,f121,f0		}
    661 { .mfi;		xma.lu	f50=f32,f121,f0		};;
    662 { .mfi;		xma.hu	f61=f32,f122,f0		}
    663 { .mfi;		xma.lu	f60=f32,f122,f0		};;
    664 { .mfi;		xma.hu	f71=f32,f123,f0		}
    665 { .mfi;		xma.lu	f70=f32,f123,f0		};;
    666 { .mfi;		xma.hu	f81=f32,f124,f0		}
    667 { .mfi;		xma.lu	f80=f32,f124,f0		};;
    668 { .mfi;		xma.hu	f91=f32,f125,f0		}
    669 { .mfi;		xma.lu	f90=f32,f125,f0		};;
    670 { .mfi;		xma.hu	f101=f32,f126,f0	}
    671 { .mfi;		xma.lu	f100=f32,f126,f0	};;
    672 { .mfi;		xma.hu	f111=f32,f127,f0	}
    673 { .mfi;		xma.lu	f110=f32,f127,f0	};;//
    674 // (*)	You can argue that splitting at every second bundle would
    675 //	prevent "wider" IA-64 implementations from achieving the peak
    676 //	performance. Well, not really... The catch is that if you
    677 //	intend to keep 4 FP units busy by splitting at every fourth
    678 //	bundle and thus perform these 16 multiplications in 4 ticks,
    679 //	the first bundle *below* would stall because the result from
    680 //	the first xma bundle *above* won't be available for another 3
    681 //	ticks (if not more, being an optimist, I assume that "wider"
    682 //	implementation will have same latency:-). This stall will hold
    683 //	you back and the performance would be as if every second bundle
    684 //	were split *anyway*...
    685 { .mfi;	getf.sig	r16=f40
    686 		xma.hu	f42=f33,f120,f41
    687 	add		r33=8,r32		}
    688 { .mfi;		xma.lu	f41=f33,f120,f41	};;
    689 { .mfi;	getf.sig	r24=f50
    690 		xma.hu	f52=f33,f121,f51	}
    691 { .mfi;		xma.lu	f51=f33,f121,f51	};;
    692 { .mfi;	st8		[r32]=r16,16
    693 		xma.hu	f62=f33,f122,f61	}
    694 { .mfi;		xma.lu	f61=f33,f122,f61	};;
    695 { .mfi;		xma.hu	f72=f33,f123,f71	}
    696 { .mfi;		xma.lu	f71=f33,f123,f71	};;
    697 { .mfi;		xma.hu	f82=f33,f124,f81	}
    698 { .mfi;		xma.lu	f81=f33,f124,f81	};;
    699 { .mfi;		xma.hu	f92=f33,f125,f91	}
    700 { .mfi;		xma.lu	f91=f33,f125,f91	};;
    701 { .mfi;		xma.hu	f102=f33,f126,f101	}
    702 { .mfi;		xma.lu	f101=f33,f126,f101	};;
    703 { .mfi;		xma.hu	f112=f33,f127,f111	}
    704 { .mfi;		xma.lu	f111=f33,f127,f111	};;//
    705 //-------------------------------------------------//
    706 { .mfi;	getf.sig	r25=f41
    707 		xma.hu	f43=f34,f120,f42	}
    708 { .mfi;		xma.lu	f42=f34,f120,f42	};;
    709 { .mfi;	getf.sig	r16=f60
    710 		xma.hu	f53=f34,f121,f52	}
    711 { .mfi;		xma.lu	f52=f34,f121,f52	};;
    712 { .mfi;	getf.sig	r17=f51
    713 		xma.hu	f63=f34,f122,f62
    714 	add		r25=r25,r24		}
    715 { .mfi;		xma.lu	f62=f34,f122,f62
    716 	mov		carry1=0		};;
    717 { .mfi;	cmp.ltu		p6,p0=r25,r24
    718 		xma.hu	f73=f34,f123,f72	}
    719 { .mfi;		xma.lu	f72=f34,f123,f72	};;
    720 { .mfi;	st8		[r33]=r25,16
    721 		xma.hu	f83=f34,f124,f82
    722 (p6)	add		carry1=1,carry1		}
    723 { .mfi;		xma.lu	f82=f34,f124,f82	};;
    724 { .mfi;		xma.hu	f93=f34,f125,f92	}
    725 { .mfi;		xma.lu	f92=f34,f125,f92	};;
    726 { .mfi;		xma.hu	f103=f34,f126,f102	}
    727 { .mfi;		xma.lu	f102=f34,f126,f102	};;
    728 { .mfi;		xma.hu	f113=f34,f127,f112	}
    729 { .mfi;		xma.lu	f112=f34,f127,f112	};;//
    730 //-------------------------------------------------//
    731 { .mfi;	getf.sig	r18=f42
    732 		xma.hu	f44=f35,f120,f43
    733 	add		r17=r17,r16		}
    734 { .mfi;		xma.lu	f43=f35,f120,f43	};;
    735 { .mfi;	getf.sig	r24=f70
    736 		xma.hu	f54=f35,f121,f53	}
    737 { .mfi;	mov		carry2=0
    738 		xma.lu	f53=f35,f121,f53	};;
    739 { .mfi;	getf.sig	r25=f61
    740 		xma.hu	f64=f35,f122,f63
    741 	cmp.ltu		p7,p0=r17,r16		}
    742 { .mfi;	add		r18=r18,r17
    743 		xma.lu	f63=f35,f122,f63	};;
    744 { .mfi;	getf.sig	r26=f52
    745 		xma.hu	f74=f35,f123,f73
    746 (p7)	add		carry2=1,carry2		}
    747 { .mfi;	cmp.ltu		p7,p0=r18,r17
    748 		xma.lu	f73=f35,f123,f73
    749 	add		r18=r18,carry1		};;
    750 { .mfi;
    751 		xma.hu	f84=f35,f124,f83
    752 (p7)	add		carry2=1,carry2		}
    753 { .mfi;	cmp.ltu		p7,p0=r18,carry1
    754 		xma.lu	f83=f35,f124,f83	};;
    755 { .mfi;	st8		[r32]=r18,16
    756 		xma.hu	f94=f35,f125,f93
    757 (p7)	add		carry2=1,carry2		}
    758 { .mfi;		xma.lu	f93=f35,f125,f93	};;
    759 { .mfi;		xma.hu	f104=f35,f126,f103	}
    760 { .mfi;		xma.lu	f103=f35,f126,f103	};;
    761 { .mfi;		xma.hu	f114=f35,f127,f113	}
    762 { .mfi;	mov		carry1=0
    763 		xma.lu	f113=f35,f127,f113
    764 	add		r25=r25,r24		};;//
    765 //-------------------------------------------------//
    766 { .mfi;	getf.sig	r27=f43
    767 		xma.hu	f45=f36,f120,f44
    768 	cmp.ltu		p6,p0=r25,r24		}
    769 { .mfi;		xma.lu	f44=f36,f120,f44
    770 	add		r26=r26,r25		};;
    771 { .mfi;	getf.sig	r16=f80
    772 		xma.hu	f55=f36,f121,f54
    773 (p6)	add		carry1=1,carry1		}
    774 { .mfi;		xma.lu	f54=f36,f121,f54	};;
    775 { .mfi;	getf.sig	r17=f71
    776 		xma.hu	f65=f36,f122,f64
    777 	cmp.ltu		p6,p0=r26,r25		}
    778 { .mfi;		xma.lu	f64=f36,f122,f64
    779 	add		r27=r27,r26		};;
    780 { .mfi;	getf.sig	r18=f62
    781 		xma.hu	f75=f36,f123,f74
    782 (p6)	add		carry1=1,carry1		}
    783 { .mfi;	cmp.ltu		p6,p0=r27,r26
    784 		xma.lu	f74=f36,f123,f74
    785 	add		r27=r27,carry2		};;
    786 { .mfi;	getf.sig	r19=f53
    787 		xma.hu	f85=f36,f124,f84
    788 (p6)	add		carry1=1,carry1		}
    789 { .mfi;		xma.lu	f84=f36,f124,f84
    790 	cmp.ltu		p6,p0=r27,carry2	};;
    791 { .mfi;	st8		[r33]=r27,16
    792 		xma.hu	f95=f36,f125,f94
    793 (p6)	add		carry1=1,carry1		}
    794 { .mfi;		xma.lu	f94=f36,f125,f94	};;
    795 { .mfi;		xma.hu	f105=f36,f126,f104	}
    796 { .mfi;	mov		carry2=0
    797 		xma.lu	f104=f36,f126,f104
    798 	add		r17=r17,r16		};;
    799 { .mfi;		xma.hu	f115=f36,f127,f114
    800 	cmp.ltu		p7,p0=r17,r16		}
    801 { .mfi;		xma.lu	f114=f36,f127,f114
    802 	add		r18=r18,r17		};;//
    803 //-------------------------------------------------//
    804 { .mfi;	getf.sig	r20=f44
    805 		xma.hu	f46=f37,f120,f45
    806 (p7)	add		carry2=1,carry2		}
    807 { .mfi;	cmp.ltu		p7,p0=r18,r17
    808 		xma.lu	f45=f37,f120,f45
    809 	add		r19=r19,r18		};;
    810 { .mfi;	getf.sig	r24=f90
    811 		xma.hu	f56=f37,f121,f55	}
    812 { .mfi;		xma.lu	f55=f37,f121,f55	};;
    813 { .mfi;	getf.sig	r25=f81
    814 		xma.hu	f66=f37,f122,f65
    815 (p7)	add		carry2=1,carry2		}
    816 { .mfi;	cmp.ltu		p7,p0=r19,r18
    817 		xma.lu	f65=f37,f122,f65
    818 	add		r20=r20,r19		};;
    819 { .mfi;	getf.sig	r26=f72
    820 		xma.hu	f76=f37,f123,f75
    821 (p7)	add		carry2=1,carry2		}
    822 { .mfi;	cmp.ltu		p7,p0=r20,r19
    823 		xma.lu	f75=f37,f123,f75
    824 	add		r20=r20,carry1		};;
    825 { .mfi;	getf.sig	r27=f63
    826 		xma.hu	f86=f37,f124,f85
    827 (p7)	add		carry2=1,carry2		}
    828 { .mfi;		xma.lu	f85=f37,f124,f85
    829 	cmp.ltu		p7,p0=r20,carry1	};;
    830 { .mfi;	getf.sig	r28=f54
    831 		xma.hu	f96=f37,f125,f95
    832 (p7)	add		carry2=1,carry2		}
    833 { .mfi;	st8		[r32]=r20,16
    834 		xma.lu	f95=f37,f125,f95	};;
    835 { .mfi;		xma.hu	f106=f37,f126,f105	}
    836 { .mfi;	mov		carry1=0
    837 		xma.lu	f105=f37,f126,f105
    838 	add		r25=r25,r24		};;
    839 { .mfi;		xma.hu	f116=f37,f127,f115
    840 	cmp.ltu		p6,p0=r25,r24		}
    841 { .mfi;		xma.lu	f115=f37,f127,f115
    842 	add		r26=r26,r25		};;//
    843 //-------------------------------------------------//
    844 { .mfi;	getf.sig	r29=f45
    845 		xma.hu	f47=f38,f120,f46
    846 (p6)	add		carry1=1,carry1		}
    847 { .mfi;	cmp.ltu		p6,p0=r26,r25
    848 		xma.lu	f46=f38,f120,f46
    849 	add		r27=r27,r26		};;
    850 { .mfi;	getf.sig	r16=f100
    851 		xma.hu	f57=f38,f121,f56
    852 (p6)	add		carry1=1,carry1		}
    853 { .mfi;	cmp.ltu		p6,p0=r27,r26
    854 		xma.lu	f56=f38,f121,f56
    855 	add		r28=r28,r27		};;
    856 { .mfi;	getf.sig	r17=f91
    857 		xma.hu	f67=f38,f122,f66
    858 (p6)	add		carry1=1,carry1		}
    859 { .mfi;	cmp.ltu		p6,p0=r28,r27
    860 		xma.lu	f66=f38,f122,f66
    861 	add		r29=r29,r28		};;
    862 { .mfi;	getf.sig	r18=f82
    863 		xma.hu	f77=f38,f123,f76
    864 (p6)	add		carry1=1,carry1		}
    865 { .mfi;	cmp.ltu		p6,p0=r29,r28
    866 		xma.lu	f76=f38,f123,f76
    867 	add		r29=r29,carry2		};;
    868 { .mfi;	getf.sig	r19=f73
    869 		xma.hu	f87=f38,f124,f86
    870 (p6)	add		carry1=1,carry1		}
    871 { .mfi;		xma.lu	f86=f38,f124,f86
    872 	cmp.ltu		p6,p0=r29,carry2	};;
    873 { .mfi;	getf.sig	r20=f64
    874 		xma.hu	f97=f38,f125,f96
    875 (p6)	add		carry1=1,carry1		}
    876 { .mfi;	st8		[r33]=r29,16
    877 		xma.lu	f96=f38,f125,f96	};;
    878 { .mfi;	getf.sig	r21=f55
    879 		xma.hu	f107=f38,f126,f106	}
    880 { .mfi;	mov		carry2=0
    881 		xma.lu	f106=f38,f126,f106
    882 	add		r17=r17,r16		};;
    883 { .mfi;		xma.hu	f117=f38,f127,f116
    884 	cmp.ltu		p7,p0=r17,r16		}
    885 { .mfi;		xma.lu	f116=f38,f127,f116
    886 	add		r18=r18,r17		};;//
    887 //-------------------------------------------------//
    888 { .mfi;	getf.sig	r22=f46
    889 		xma.hu	f48=f39,f120,f47
    890 (p7)	add		carry2=1,carry2		}
    891 { .mfi;	cmp.ltu		p7,p0=r18,r17
    892 		xma.lu	f47=f39,f120,f47
    893 	add		r19=r19,r18		};;
    894 { .mfi;	getf.sig	r24=f110
    895 		xma.hu	f58=f39,f121,f57
    896 (p7)	add		carry2=1,carry2		}
    897 { .mfi;	cmp.ltu		p7,p0=r19,r18
    898 		xma.lu	f57=f39,f121,f57
    899 	add		r20=r20,r19		};;
    900 { .mfi;	getf.sig	r25=f101
    901 		xma.hu	f68=f39,f122,f67
    902 (p7)	add		carry2=1,carry2		}
    903 { .mfi;	cmp.ltu		p7,p0=r20,r19
    904 		xma.lu	f67=f39,f122,f67
    905 	add		r21=r21,r20		};;
    906 { .mfi;	getf.sig	r26=f92
    907 		xma.hu	f78=f39,f123,f77
    908 (p7)	add		carry2=1,carry2		}
    909 { .mfi;	cmp.ltu		p7,p0=r21,r20
    910 		xma.lu	f77=f39,f123,f77
    911 	add		r22=r22,r21		};;
    912 { .mfi;	getf.sig	r27=f83
    913 		xma.hu	f88=f39,f124,f87
    914 (p7)	add		carry2=1,carry2		}
    915 { .mfi;	cmp.ltu		p7,p0=r22,r21
    916 		xma.lu	f87=f39,f124,f87
    917 	add		r22=r22,carry1		};;
    918 { .mfi;	getf.sig	r28=f74
    919 		xma.hu	f98=f39,f125,f97
    920 (p7)	add		carry2=1,carry2		}
    921 { .mfi;		xma.lu	f97=f39,f125,f97
    922 	cmp.ltu		p7,p0=r22,carry1	};;
    923 { .mfi;	getf.sig	r29=f65
    924 		xma.hu	f108=f39,f126,f107
    925 (p7)	add		carry2=1,carry2		}
    926 { .mfi;	st8		[r32]=r22,16
    927 		xma.lu	f107=f39,f126,f107	};;
    928 { .mfi;	getf.sig	r30=f56
    929 		xma.hu	f118=f39,f127,f117	}
    930 { .mfi;		xma.lu	f117=f39,f127,f117	};;//
    931 //-------------------------------------------------//
    932 // Leaving muliplier's heaven... Quite a ride, huh?
    933 
    934 { .mii;	getf.sig	r31=f47
    935 	add		r25=r25,r24
    936 	mov		carry1=0		};;
    937 { .mii;		getf.sig	r16=f111
    938 	cmp.ltu		p6,p0=r25,r24
    939 	add		r26=r26,r25		};;
    940 { .mfb;		getf.sig	r17=f102	}
    941 { .mii;
    942 (p6)	add		carry1=1,carry1
    943 	cmp.ltu		p6,p0=r26,r25
    944 	add		r27=r27,r26		};;
    945 { .mfb;	nop.m	0x0				}
    946 { .mii;
    947 (p6)	add		carry1=1,carry1
    948 	cmp.ltu		p6,p0=r27,r26
    949 	add		r28=r28,r27		};;
    950 { .mii;		getf.sig	r18=f93
    951 		add		r17=r17,r16
    952 		mov		carry3=0	}
    953 { .mii;
    954 (p6)	add		carry1=1,carry1
    955 	cmp.ltu		p6,p0=r28,r27
    956 	add		r29=r29,r28		};;
    957 { .mii;		getf.sig	r19=f84
    958 		cmp.ltu		p7,p0=r17,r16	}
    959 { .mii;
    960 (p6)	add		carry1=1,carry1
    961 	cmp.ltu		p6,p0=r29,r28
    962 	add		r30=r30,r29		};;
    963 { .mii;		getf.sig	r20=f75
    964 		add		r18=r18,r17	}
    965 { .mii;
    966 (p6)	add		carry1=1,carry1
    967 	cmp.ltu		p6,p0=r30,r29
    968 	add		r31=r31,r30		};;
    969 { .mfb;		getf.sig	r21=f66		}
    970 { .mii;	(p7)	add		carry3=1,carry3
    971 		cmp.ltu		p7,p0=r18,r17
    972 		add		r19=r19,r18	}
    973 { .mfb;	nop.m	0x0				}
    974 { .mii;
    975 (p6)	add		carry1=1,carry1
    976 	cmp.ltu		p6,p0=r31,r30
    977 	add		r31=r31,carry2		};;
    978 { .mfb;		getf.sig	r22=f57		}
    979 { .mii;	(p7)	add		carry3=1,carry3
    980 		cmp.ltu		p7,p0=r19,r18
    981 		add		r20=r20,r19	}
    982 { .mfb;	nop.m	0x0				}
    983 { .mii;
    984 (p6)	add		carry1=1,carry1
    985 	cmp.ltu		p6,p0=r31,carry2	};;
    986 { .mfb;		getf.sig	r23=f48		}
    987 { .mii;	(p7)	add		carry3=1,carry3
    988 		cmp.ltu		p7,p0=r20,r19
    989 		add		r21=r21,r20	}
    990 { .mii;
    991 (p6)	add		carry1=1,carry1		}
    992 { .mfb;	st8		[r33]=r31,16		};;
    993 
    994 { .mfb;	getf.sig	r24=f112		}
    995 { .mii;	(p7)	add		carry3=1,carry3
    996 		cmp.ltu		p7,p0=r21,r20
    997 		add		r22=r22,r21	};;
    998 { .mfb;	getf.sig	r25=f103		}
    999 { .mii;	(p7)	add		carry3=1,carry3
   1000 		cmp.ltu		p7,p0=r22,r21
   1001 		add		r23=r23,r22	};;
   1002 { .mfb;	getf.sig	r26=f94			}
   1003 { .mii;	(p7)	add		carry3=1,carry3
   1004 		cmp.ltu		p7,p0=r23,r22
   1005 		add		r23=r23,carry1	};;
   1006 { .mfb;	getf.sig	r27=f85			}
   1007 { .mii;	(p7)	add		carry3=1,carry3
   1008 		cmp.ltu		p7,p8=r23,carry1};;
   1009 { .mii;	getf.sig	r28=f76
   1010 	add		r25=r25,r24
   1011 	mov		carry1=0		}
   1012 { .mii;		st8		[r32]=r23,16
   1013 	(p7)	add		carry2=1,carry3
   1014 	(p8)	add		carry2=0,carry3	};;
   1015 
   1016 { .mfb;	nop.m	0x0				}
   1017 { .mii;	getf.sig	r29=f67
   1018 	cmp.ltu		p6,p0=r25,r24
   1019 	add		r26=r26,r25		};;
   1020 { .mfb;	getf.sig	r30=f58			}
   1021 { .mii;
   1022 (p6)	add		carry1=1,carry1
   1023 	cmp.ltu		p6,p0=r26,r25
   1024 	add		r27=r27,r26		};;
   1025 { .mfb;		getf.sig	r16=f113	}
   1026 { .mii;
   1027 (p6)	add		carry1=1,carry1
   1028 	cmp.ltu		p6,p0=r27,r26
   1029 	add		r28=r28,r27		};;
   1030 { .mfb;		getf.sig	r17=f104	}
   1031 { .mii;
   1032 (p6)	add		carry1=1,carry1
   1033 	cmp.ltu		p6,p0=r28,r27
   1034 	add		r29=r29,r28		};;
   1035 { .mfb;		getf.sig	r18=f95		}
   1036 { .mii;
   1037 (p6)	add		carry1=1,carry1
   1038 	cmp.ltu		p6,p0=r29,r28
   1039 	add		r30=r30,r29		};;
   1040 { .mii;		getf.sig	r19=f86
   1041 		add		r17=r17,r16
   1042 		mov		carry3=0	}
   1043 { .mii;
   1044 (p6)	add		carry1=1,carry1
   1045 	cmp.ltu		p6,p0=r30,r29
   1046 	add		r30=r30,carry2		};;
   1047 { .mii;		getf.sig	r20=f77
   1048 		cmp.ltu		p7,p0=r17,r16
   1049 		add		r18=r18,r17	}
   1050 { .mii;
   1051 (p6)	add		carry1=1,carry1
   1052 	cmp.ltu		p6,p0=r30,carry2	};;
   1053 { .mfb;		getf.sig	r21=f68		}
   1054 { .mii;	st8		[r33]=r30,16
   1055 (p6)	add		carry1=1,carry1		};;
   1056 
   1057 { .mfb;	getf.sig	r24=f114		}
   1058 { .mii;	(p7)	add		carry3=1,carry3
   1059 		cmp.ltu		p7,p0=r18,r17
   1060 		add		r19=r19,r18	};;
   1061 { .mfb;	getf.sig	r25=f105		}
   1062 { .mii;	(p7)	add		carry3=1,carry3
   1063 		cmp.ltu		p7,p0=r19,r18
   1064 		add		r20=r20,r19	};;
   1065 { .mfb;	getf.sig	r26=f96			}
   1066 { .mii;	(p7)	add		carry3=1,carry3
   1067 		cmp.ltu		p7,p0=r20,r19
   1068 		add		r21=r21,r20	};;
   1069 { .mfb;	getf.sig	r27=f87			}
   1070 { .mii;	(p7)	add		carry3=1,carry3
   1071 		cmp.ltu		p7,p0=r21,r20
   1072 		add		r21=r21,carry1	};;
   1073 { .mib;	getf.sig	r28=f78
   1074 	add		r25=r25,r24		}
   1075 { .mib;	(p7)	add		carry3=1,carry3
   1076 		cmp.ltu		p7,p8=r21,carry1};;
   1077 { .mii;		st8		[r32]=r21,16
   1078 	(p7)	add		carry2=1,carry3
   1079 	(p8)	add		carry2=0,carry3	}
   1080 
   1081 { .mii;	mov		carry1=0
   1082 	cmp.ltu		p6,p0=r25,r24
   1083 	add		r26=r26,r25		};;
   1084 { .mfb;		getf.sig	r16=f115	}
   1085 { .mii;
   1086 (p6)	add		carry1=1,carry1
   1087 	cmp.ltu		p6,p0=r26,r25
   1088 	add		r27=r27,r26		};;
   1089 { .mfb;		getf.sig	r17=f106	}
   1090 { .mii;
   1091 (p6)	add		carry1=1,carry1
   1092 	cmp.ltu		p6,p0=r27,r26
   1093 	add		r28=r28,r27		};;
   1094 { .mfb;		getf.sig	r18=f97		}
   1095 { .mii;
   1096 (p6)	add		carry1=1,carry1
   1097 	cmp.ltu		p6,p0=r28,r27
   1098 	add		r28=r28,carry2		};;
   1099 { .mib;		getf.sig	r19=f88
   1100 		add		r17=r17,r16	}
   1101 { .mib;
   1102 (p6)	add		carry1=1,carry1
   1103 	cmp.ltu		p6,p0=r28,carry2	};;
   1104 { .mii;	st8		[r33]=r28,16
   1105 (p6)	add		carry1=1,carry1		}
   1106 
   1107 { .mii;		mov		carry2=0
   1108 		cmp.ltu		p7,p0=r17,r16
   1109 		add		r18=r18,r17	};;
   1110 { .mfb;	getf.sig	r24=f116		}
   1111 { .mii;	(p7)	add		carry2=1,carry2
   1112 		cmp.ltu		p7,p0=r18,r17
   1113 		add		r19=r19,r18	};;
   1114 { .mfb;	getf.sig	r25=f107		}
   1115 { .mii;	(p7)	add		carry2=1,carry2
   1116 		cmp.ltu		p7,p0=r19,r18
   1117 		add		r19=r19,carry1	};;
   1118 { .mfb;	getf.sig	r26=f98			}
   1119 { .mii;	(p7)	add		carry2=1,carry2
   1120 		cmp.ltu		p7,p0=r19,carry1};;
   1121 { .mii;		st8		[r32]=r19,16
   1122 	(p7)	add		carry2=1,carry2	}
   1123 
   1124 { .mfb;	add		r25=r25,r24		};;
   1125 
   1126 { .mfb;		getf.sig	r16=f117	}
   1127 { .mii;	mov		carry1=0
   1128 	cmp.ltu		p6,p0=r25,r24
   1129 	add		r26=r26,r25		};;
   1130 { .mfb;		getf.sig	r17=f108	}
   1131 { .mii;
   1132 (p6)	add		carry1=1,carry1
   1133 	cmp.ltu		p6,p0=r26,r25
   1134 	add		r26=r26,carry2		};;
   1135 { .mfb;	nop.m	0x0				}
   1136 { .mii;
   1137 (p6)	add		carry1=1,carry1
   1138 	cmp.ltu		p6,p0=r26,carry2	};;
   1139 { .mii;	st8		[r33]=r26,16
   1140 (p6)	add		carry1=1,carry1		}
   1141 
   1142 { .mfb;		add		r17=r17,r16	};;
   1143 { .mfb;	getf.sig	r24=f118		}
   1144 { .mii;		mov		carry2=0
   1145 		cmp.ltu		p7,p0=r17,r16
   1146 		add		r17=r17,carry1	};;
   1147 { .mii;	(p7)	add		carry2=1,carry2
   1148 		cmp.ltu		p7,p0=r17,carry1};;
   1149 { .mii;		st8		[r32]=r17
   1150 	(p7)	add		carry2=1,carry2	};;
   1151 { .mfb;	add		r24=r24,carry2		};;
   1152 { .mib;	st8		[r33]=r24		}
   1153 
   1154 { .mib;	rum		1<<5		// clear um.mfh
   1155 	br.ret.sptk.many	b0	};;
   1156 .endp	bn_mul_comba8#
   1157 #undef	carry3
   1158 #undef	carry2
   1159 #undef	carry1
   1160 #endif
   1161 
   1162 #if 1
   1163 // It's possible to make it faster (see comment to bn_sqr_comba8), but
   1164 // I reckon it doesn't worth the effort. Basically because the routine
   1165 // (actually both of them) practically never called... So I just play
   1166 // same trick as with bn_sqr_comba8.
   1167 //
   1168 // void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
   1169 //
   1170 .global	bn_sqr_comba4#
   1171 .proc	bn_sqr_comba4#
   1172 .align	64
   1173 bn_sqr_comba4:
   1174 	.prologue
   1175 	.save	ar.pfs,r2
   1176 #if defined(_HPUX_SOURCE) && !defined(_LP64)
   1177 { .mii;	alloc   r2=ar.pfs,2,1,0,0
   1178 	addp4	r32=0,r32
   1179 	addp4	r33=0,r33		};;
   1180 { .mii;
   1181 #else
   1182 { .mii;	alloc	r2=ar.pfs,2,1,0,0
   1183 #endif
   1184 	mov	r34=r33
   1185 	add	r14=8,r33		};;
   1186 	.body
   1187 { .mii;	add	r17=8,r34
   1188 	add	r15=16,r33
   1189 	add	r18=16,r34		}
   1190 { .mfb;	add	r16=24,r33
   1191 	br	.L_cheat_entry_point4	};;
   1192 .endp	bn_sqr_comba4#
   1193 #endif
   1194 
   1195 #if 1
   1196 // Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
   1197 //
   1198 // void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   1199 //
   1200 #define	carry1	r14
   1201 #define	carry2	r15
   1202 .global	bn_mul_comba4#
   1203 .proc	bn_mul_comba4#
   1204 .align	64
   1205 bn_mul_comba4:
   1206 	.prologue
   1207 	.save	ar.pfs,r2
   1208 #if defined(_HPUX_SOURCE) && !defined(_LP64)
   1209 { .mii;	alloc   r2=ar.pfs,3,0,0,0
   1210 	addp4	r33=0,r33
   1211 	addp4	r34=0,r34		};;
   1212 { .mii;	addp4	r32=0,r32
   1213 #else
   1214 { .mii;	alloc	r2=ar.pfs,3,0,0,0
   1215 #endif
   1216 	add	r14=8,r33
   1217 	add	r17=8,r34		}
   1218 	.body
   1219 { .mii;	add	r15=16,r33
   1220 	add	r18=16,r34
   1221 	add	r16=24,r33		};;
   1222 .L_cheat_entry_point4:
   1223 { .mmi;	add	r19=24,r34
   1224 
   1225 	ldf8	f32=[r33]		}
   1226 
   1227 { .mmi;	ldf8	f120=[r34]
   1228 	ldf8	f121=[r17]		};;
   1229 { .mmi;	ldf8	f122=[r18]
   1230 	ldf8	f123=[r19]		}
   1231 
   1232 { .mmi;	ldf8	f33=[r14]
   1233 	ldf8	f34=[r15]		}
   1234 { .mfi;	ldf8	f35=[r16]
   1235 
   1236 		xma.hu	f41=f32,f120,f0		}
   1237 { .mfi;		xma.lu	f40=f32,f120,f0		};;
   1238 { .mfi;		xma.hu	f51=f32,f121,f0		}
   1239 { .mfi;		xma.lu	f50=f32,f121,f0		};;
   1240 { .mfi;		xma.hu	f61=f32,f122,f0		}
   1241 { .mfi;		xma.lu	f60=f32,f122,f0		};;
   1242 { .mfi;		xma.hu	f71=f32,f123,f0		}
   1243 { .mfi;		xma.lu	f70=f32,f123,f0		};;//
   1244 // Major stall takes place here, and 3 more places below. Result from
   1245 // first xma is not available for another 3 ticks.
   1246 { .mfi;	getf.sig	r16=f40
   1247 		xma.hu	f42=f33,f120,f41
   1248 	add		r33=8,r32		}
   1249 { .mfi;		xma.lu	f41=f33,f120,f41	};;
   1250 { .mfi;	getf.sig	r24=f50
   1251 		xma.hu	f52=f33,f121,f51	}
   1252 { .mfi;		xma.lu	f51=f33,f121,f51	};;
   1253 { .mfi;	st8		[r32]=r16,16
   1254 		xma.hu	f62=f33,f122,f61	}
   1255 { .mfi;		xma.lu	f61=f33,f122,f61	};;
   1256 { .mfi;		xma.hu	f72=f33,f123,f71	}
   1257 { .mfi;		xma.lu	f71=f33,f123,f71	};;//
   1258 //-------------------------------------------------//
   1259 { .mfi;	getf.sig	r25=f41
   1260 		xma.hu	f43=f34,f120,f42	}
   1261 { .mfi;		xma.lu	f42=f34,f120,f42	};;
   1262 { .mfi;	getf.sig	r16=f60
   1263 		xma.hu	f53=f34,f121,f52	}
   1264 { .mfi;		xma.lu	f52=f34,f121,f52	};;
   1265 { .mfi;	getf.sig	r17=f51
   1266 		xma.hu	f63=f34,f122,f62
   1267 	add		r25=r25,r24		}
   1268 { .mfi;	mov		carry1=0
   1269 		xma.lu	f62=f34,f122,f62	};;
   1270 { .mfi;	st8		[r33]=r25,16
   1271 		xma.hu	f73=f34,f123,f72
   1272 	cmp.ltu		p6,p0=r25,r24		}
   1273 { .mfi;		xma.lu	f72=f34,f123,f72	};;//
   1274 //-------------------------------------------------//
   1275 { .mfi;	getf.sig	r18=f42
   1276 		xma.hu	f44=f35,f120,f43
   1277 (p6)	add		carry1=1,carry1		}
   1278 { .mfi;	add		r17=r17,r16
   1279 		xma.lu	f43=f35,f120,f43
   1280 	mov		carry2=0		};;
   1281 { .mfi;	getf.sig	r24=f70
   1282 		xma.hu	f54=f35,f121,f53
   1283 	cmp.ltu		p7,p0=r17,r16		}
   1284 { .mfi;		xma.lu	f53=f35,f121,f53	};;
   1285 { .mfi;	getf.sig	r25=f61
   1286 		xma.hu	f64=f35,f122,f63
   1287 	add		r18=r18,r17		}
   1288 { .mfi;		xma.lu	f63=f35,f122,f63
   1289 (p7)	add		carry2=1,carry2		};;
   1290 { .mfi;	getf.sig	r26=f52
   1291 		xma.hu	f74=f35,f123,f73
   1292 	cmp.ltu		p7,p0=r18,r17		}
   1293 { .mfi;		xma.lu	f73=f35,f123,f73
   1294 	add		r18=r18,carry1		};;
   1295 //-------------------------------------------------//
   1296 { .mii;	st8		[r32]=r18,16
   1297 (p7)	add		carry2=1,carry2
   1298 	cmp.ltu		p7,p0=r18,carry1	};;
   1299 
   1300 { .mfi;	getf.sig	r27=f43	// last major stall
   1301 (p7)	add		carry2=1,carry2		};;
   1302 { .mii;		getf.sig	r16=f71
   1303 	add		r25=r25,r24
   1304 	mov		carry1=0		};;
   1305 { .mii;		getf.sig	r17=f62
   1306 	cmp.ltu		p6,p0=r25,r24
   1307 	add		r26=r26,r25		};;
   1308 { .mii;
   1309 (p6)	add		carry1=1,carry1
   1310 	cmp.ltu		p6,p0=r26,r25
   1311 	add		r27=r27,r26		};;
   1312 { .mii;
   1313 (p6)	add		carry1=1,carry1
   1314 	cmp.ltu		p6,p0=r27,r26
   1315 	add		r27=r27,carry2		};;
   1316 { .mii;		getf.sig	r18=f53
   1317 (p6)	add		carry1=1,carry1
   1318 	cmp.ltu		p6,p0=r27,carry2	};;
   1319 { .mfi;	st8		[r33]=r27,16
   1320 (p6)	add		carry1=1,carry1		}
   1321 
   1322 { .mii;		getf.sig	r19=f44
   1323 		add		r17=r17,r16
   1324 		mov		carry2=0	};;
   1325 { .mii;	getf.sig	r24=f72
   1326 		cmp.ltu		p7,p0=r17,r16
   1327 		add		r18=r18,r17	};;
   1328 { .mii;	(p7)	add		carry2=1,carry2
   1329 		cmp.ltu		p7,p0=r18,r17
   1330 		add		r19=r19,r18	};;
   1331 { .mii;	(p7)	add		carry2=1,carry2
   1332 		cmp.ltu		p7,p0=r19,r18
   1333 		add		r19=r19,carry1	};;
   1334 { .mii;	getf.sig	r25=f63
   1335 	(p7)	add		carry2=1,carry2
   1336 		cmp.ltu		p7,p0=r19,carry1};;
   1337 { .mii;		st8		[r32]=r19,16
   1338 	(p7)	add		carry2=1,carry2	}
   1339 
   1340 { .mii;	getf.sig	r26=f54
   1341 	add		r25=r25,r24
   1342 	mov		carry1=0		};;
   1343 { .mii;		getf.sig	r16=f73
   1344 	cmp.ltu		p6,p0=r25,r24
   1345 	add		r26=r26,r25		};;
   1346 { .mii;
   1347 (p6)	add		carry1=1,carry1
   1348 	cmp.ltu		p6,p0=r26,r25
   1349 	add		r26=r26,carry2		};;
   1350 { .mii;		getf.sig	r17=f64
   1351 (p6)	add		carry1=1,carry1
   1352 	cmp.ltu		p6,p0=r26,carry2	};;
   1353 { .mii;	st8		[r33]=r26,16
   1354 (p6)	add		carry1=1,carry1		}
   1355 
   1356 { .mii;	getf.sig	r24=f74
   1357 		add		r17=r17,r16
   1358 		mov		carry2=0	};;
   1359 { .mii;		cmp.ltu		p7,p0=r17,r16
   1360 		add		r17=r17,carry1	};;
   1361 
   1362 { .mii;	(p7)	add		carry2=1,carry2
   1363 		cmp.ltu		p7,p0=r17,carry1};;
   1364 { .mii;		st8		[r32]=r17,16
   1365 	(p7)	add		carry2=1,carry2	};;
   1366 
   1367 { .mii;	add		r24=r24,carry2		};;
   1368 { .mii;	st8		[r33]=r24		}
   1369 
   1370 { .mib;	rum		1<<5		// clear um.mfh
   1371 	br.ret.sptk.many	b0	};;
   1372 .endp	bn_mul_comba4#
   1373 #undef	carry2
   1374 #undef	carry1
   1375 #endif
   1376 
   1377 #if 1
   1378 //
   1379 // BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
   1380 //
   1381 // In the nutshell it's a port of my MIPS III/IV implementation.
   1382 //
   1383 #define	AT	r14
   1384 #define	H	r16
   1385 #define	HH	r20
   1386 #define	L	r17
   1387 #define	D	r18
   1388 #define	DH	r22
   1389 #define	I	r21
   1390 
   1391 #if 0
   1392 // Some preprocessors (most notably HP-UX) appear to be allergic to
   1393 // macros enclosed to parenthesis [as these three were].
   1394 #define	cont	p16
   1395 #define	break	p0	// p20
   1396 #define	equ	p24
   1397 #else
   1398 cont=p16
   1399 break=p0
   1400 equ=p24
   1401 #endif
   1402 
   1403 .global	abort#
   1404 .global	bn_div_words#
   1405 .proc	bn_div_words#
   1406 .align	64
   1407 bn_div_words:
   1408 	.prologue
   1409 	.save	ar.pfs,r2
   1410 { .mii;	alloc		r2=ar.pfs,3,5,0,8
   1411 	.save	b0,r3
   1412 	mov		r3=b0
   1413 	.save	pr,r10
   1414 	mov		r10=pr		};;
   1415 { .mmb;	cmp.eq		p6,p0=r34,r0
   1416 	mov		r8=-1
   1417 (p6)	br.ret.spnt.many	b0	};;
   1418 
   1419 	.body
   1420 { .mii;	mov		H=r32		// save h
   1421 	mov		ar.ec=0		// don't rotate at exit
   1422 	mov		pr.rot=0	}
   1423 { .mii;	mov		L=r33		// save l
   1424 	mov		r36=r0		};;
   1425 
   1426 .L_divw_shift:	// -vv- note signed comparison
   1427 { .mfi;	(p0)	cmp.lt		p16,p0=r0,r34	// d
   1428 	(p0)	shladd		r33=r34,1,r0	}
   1429 { .mfb;	(p0)	add		r35=1,r36
   1430 	(p0)	nop.f		0x0
   1431 (p16)	br.wtop.dpnt		.L_divw_shift	};;
   1432 
   1433 { .mii;	mov		D=r34
   1434 	shr.u		DH=r34,32
   1435 	sub		r35=64,r36		};;
   1436 { .mii;	setf.sig	f7=DH
   1437 	shr.u		AT=H,r35
   1438 	mov		I=r36			};;
   1439 { .mib;	cmp.ne		p6,p0=r0,AT
   1440 	shl		H=H,r36
   1441 (p6)	br.call.spnt.clr	b0=abort	};;	// overflow, die...
   1442 
   1443 { .mfi;	fcvt.xuf.s1	f7=f7
   1444 	shr.u		AT=L,r35		};;
   1445 { .mii;	shl		L=L,r36
   1446 	or		H=H,AT			};;
   1447 
   1448 { .mii;	nop.m		0x0
   1449 	cmp.leu		p6,p0=D,H;;
   1450 (p6)	sub		H=H,D			}
   1451 
   1452 { .mlx;	setf.sig	f14=D
   1453 	movl		AT=0xffffffff		};;
   1454 ///////////////////////////////////////////////////////////
   1455 { .mii;	setf.sig	f6=H
   1456 	shr.u		HH=H,32;;
   1457 	cmp.eq		p6,p7=HH,DH		};;
   1458 { .mfb;
   1459 (p6)	setf.sig	f8=AT
   1460 (p7)	fcvt.xuf.s1	f6=f6
   1461 (p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
   1462 
   1463 { .mfi;	getf.sig	r33=f8				// q
   1464 	xmpy.lu		f9=f8,f14		}
   1465 { .mfi;	xmpy.hu		f10=f8,f14
   1466 	shrp		H=H,L,32		};;
   1467 
   1468 { .mmi;	getf.sig	r35=f9				// tl
   1469 	getf.sig	r31=f10			};;	// th
   1470 
   1471 .L_divw_1st_iter:
   1472 { .mii;	(p0)	add		r32=-1,r33
   1473 	(p0)	cmp.eq		equ,cont=HH,r31		};;
   1474 { .mii;	(p0)	cmp.ltu		p8,p0=r35,D
   1475 	(p0)	sub		r34=r35,D
   1476 	(equ)	cmp.leu		break,cont=r35,H	};;
   1477 { .mib;	(cont)	cmp.leu		cont,break=HH,r31
   1478 	(p8)	add		r31=-1,r31
   1479 (cont)	br.wtop.spnt		.L_divw_1st_iter	};;
   1480 ///////////////////////////////////////////////////////////
   1481 { .mii;	sub		H=H,r35
   1482 	shl		r8=r33,32
   1483 	shl		L=L,32			};;
   1484 ///////////////////////////////////////////////////////////
   1485 { .mii;	setf.sig	f6=H
   1486 	shr.u		HH=H,32;;
   1487 	cmp.eq		p6,p7=HH,DH		};;
   1488 { .mfb;
   1489 (p6)	setf.sig	f8=AT
   1490 (p7)	fcvt.xuf.s1	f6=f6
   1491 (p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
   1492 
   1493 { .mfi;	getf.sig	r33=f8				// q
   1494 	xmpy.lu		f9=f8,f14		}
   1495 { .mfi;	xmpy.hu		f10=f8,f14
   1496 	shrp		H=H,L,32		};;
   1497 
   1498 { .mmi;	getf.sig	r35=f9				// tl
   1499 	getf.sig	r31=f10			};;	// th
   1500 
   1501 .L_divw_2nd_iter:
   1502 { .mii;	(p0)	add		r32=-1,r33
   1503 	(p0)	cmp.eq		equ,cont=HH,r31		};;
   1504 { .mii;	(p0)	cmp.ltu		p8,p0=r35,D
   1505 	(p0)	sub		r34=r35,D
   1506 	(equ)	cmp.leu		break,cont=r35,H	};;
   1507 { .mib;	(cont)	cmp.leu		cont,break=HH,r31
   1508 	(p8)	add		r31=-1,r31
   1509 (cont)	br.wtop.spnt		.L_divw_2nd_iter	};;
   1510 ///////////////////////////////////////////////////////////
   1511 { .mii;	sub	H=H,r35
   1512 	or	r8=r8,r33
   1513 	mov	ar.pfs=r2		};;
   1514 { .mii;	shr.u	r9=H,I			// remainder if anybody wants it
   1515 	mov	pr=r10,0x1ffff		}
   1516 { .mfb;	br.ret.sptk.many	b0	};;
   1517 
   1518 // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
   1519 // procedure.
   1520 //
   1521 // inputs:	f6 = (double)a, f7 = (double)b
   1522 // output:	f8 = (int)(a/b)
   1523 // clobbered:	f8,f9,f10,f11,pred
   1524 pred=p15
   1525 // One can argue that this snippet is copyrighted to Intel
   1526 // Corporation, as it's essentially identical to one of those
   1527 // found in "Divide, Square Root and Remainder" section at
   1528 // http://www.intel.com/software/products/opensource/libraries/num.htm.
   1529 // Yes, I admit that the referred code was used as template,
   1530 // but after I realized that there hardly is any other instruction
   1531 // sequence which would perform this operation. I mean I figure that
   1532 // any independent attempt to implement high-performance division
   1533 // will result in code virtually identical to the Intel code. It
   1534 // should be noted though that below division kernel is 1 cycle
   1535 // faster than Intel one (note commented splits:-), not to mention
   1536 // original prologue (rather lack of one) and epilogue.
   1537 .align	32
   1538 .skip	16
   1539 .L_udiv64_32_b6:
   1540 	frcpa.s1	f8,pred=f6,f7;;		// [0]  y0 = 1 / b
   1541 
   1542 (pred)	fnma.s1		f9=f7,f8,f1		// [5]  e0 = 1 - b * y0
   1543 (pred)	fmpy.s1		f10=f6,f8;;		// [5]  q0 = a * y0
   1544 (pred)	fmpy.s1		f11=f9,f9		// [10] e1 = e0 * e0
   1545 (pred)	fma.s1		f10=f9,f10,f10;;	// [10] q1 = q0 + e0 * q0
   1546 (pred)	fma.s1		f8=f9,f8,f8	//;;	// [15] y1 = y0 + e0 * y0
   1547 (pred)	fma.s1		f9=f11,f10,f10;;	// [15] q2 = q1 + e1 * q1
   1548 (pred)	fma.s1		f8=f11,f8,f8	//;;	// [20] y2 = y1 + e1 * y1
   1549 (pred)	fnma.s1		f10=f7,f9,f6;;		// [20] r2 = a - b * q2
   1550 (pred)	fma.s1		f8=f10,f8,f9;;		// [25] q3 = q2 + r2 * y2
   1551 
   1552 	fcvt.fxu.trunc.s1	f8=f8		// [30] q = trunc(q3)
   1553 	br.ret.sptk.many	b6;;
   1554 .endp	bn_div_words#
   1555 #endif
   1556