Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by David Mosberger <David.Mosberger@acm.org> based on the
      5 # Itanium optimized Crypto code which was released by HP Labs at
      6 # http://www.hpl.hp.com/research/linux/crypto/.
      7 #
      8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
      9 #
     10 # Permission is hereby granted, free of charge, to any person obtaining
     11 # a copy of this software and associated documentation files (the
     12 # "Software"), to deal in the Software without restriction, including
     13 # without limitation the rights to use, copy, modify, merge, publish,
     14 # distribute, sublicense, and/or sell copies of the Software, and to
     15 # permit persons to whom the Software is furnished to do so, subject to
     16 # the following conditions:
     17 #
     18 # The above copyright notice and this permission notice shall be
     19 # included in all copies or substantial portions of the Software.
     20 
     21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
     25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
     28 
     29 
     30 
     31 # This is a little helper program which generates a software-pipelined
     32 # loop for RC4 encryption.  The basic algorithm looks like this:
     33 #
     34 #   for (counter = 0; counter < len; ++counter)
     35 #     {
     36 #       in = inp[counter];
     37 #       SI = S[I];
     38 #       J = (SI + J) & 0xff;
     39 #       SJ = S[J];
     40 #       T = (SI + SJ) & 0xff;
     41 #       S[I] = SJ, S[J] = SI;
     42 #       ST = S[T];
     43 #       outp[counter] = in ^ ST;
     44 #       I = (I + 1) & 0xff;
     45 #     }
     46 #
     47 # Pipelining this loop isn't easy, because the stores to the S[] array
     48 # need to be observed in the right order.  The loop generated by the
     49 # code below has the following pipeline diagram:
     50 #
     51 #      cycle
     52 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
     53 # iter
     54 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
     55 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
     56 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
     57 #
     58 #   where:
     59 # 	LDI = load of S[I]
     60 # 	LDJ = load of S[J]
     61 # 	SWP = swap of S[I] and S[J]
     62 # 	LDT = load of S[T]
     63 #
     64 # Note that in the above diagram, the major trouble-spot is that LDI
     65 # of the 2nd iteration is performed BEFORE the SWP of the first
     66 # iteration.  Fortunately, this is easy to detect (I of the 1st
     67 # iteration will be equal to J of the 2nd iteration) and when this
     68 # happens, we simply forward the proper value from the 1st iteration
     69 # to the 2nd one.  The proper value in this case is simply the value
     70 # of S[I] from the first iteration (thanks to the fact that SWP
     71 # simply swaps the contents of S[I] and S[J]).
     72 #
     73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
     74 # iteration issues at the same time as the LDI of the 3rd iteration.
     75 # However, thanks to IA-64 execution semantics, this can be taken
     76 # care of simply by placing LDI later in the instruction-group than
     77 # SWP.  IA-64 CPUs will automatically forward the value if they
     78 # detect that the SWP and LDI are accessing the same memory-location.
     79 
     80 # The core-loop that can be pipelined then looks like this (annotated
     81 # with McKinley/Madison issue port & latency numbers, assuming L1
     82 # cache hits for the most part):
     83 
     84 # operation:	    instruction:		    issue-ports:  latency
     85 # ------------------  -----------------------------   ------------- -------
     86 
     87 # Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
     88 #                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
     89 # I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
     90 #                     ;;
     91 # SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
     92 #                     ;;
     93 #                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
     94 # J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
     95 #                     (pBypass) br.cond.spnt Bypass
     96 #                     ;;
     97 # ---------------------------------------------------------------------------------------
     98 # J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
     99 #                     ;;
    100 #                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
    101 #                     ;;
    102 # SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
    103 #                     ;;
    104 # ---------------------------------------------------------------------------------------
    105 # T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
    106 #                     ;;
    107 # T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
    108 # S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
    109 # S[J] = SI           st8 [Jptr] = SI                 M2-M3
    110 #                     ;;
    111 #                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
    112 #                     ;;
    113 # ---------------------------------------------------------------------------------------
    114 # T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
    115 #                     ;;
    116 # data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
    117 #                     ;;
    118 # *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
    119 #                     ;;
    120 # ---------------------------------------------------------------------------------------
    121 
    122 # There are several points worth making here:
    123 
    124 #   - Note that due to the bypass/forwarding-path, the first two
    125 #     phases of the loop are strangely mingled together.  In
    126 #     particular, note that the first stage of the pipeline is
    127 #     using the value of "J", as calculated by the second stage.
    128 #   - Each bundle-pair will have exactly 6 instructions.
    129 #   - Pipelined, the loop can execute in 3 cycles/iteration and
    130 #     4 stages.  However, McKinley/Madison can issue "st1" to
    131 #     the same bank at a rate of at most one per 4 cycles.  Thus,
    132 #     instead of storing each byte, we accumulate them in a word
    133 #     and then write them back at once with a single "st8" (this
    134 #     implies that the setup code needs to ensure that the output
    135 #     buffer is properly aligned, if need be, by encoding the
    136 #     first few bytes separately).
    137 #   - There is no space for a "br.ctop" instruction.  For this
    138 #     reason we can't use modulo-loop support in IA-64 and have
    139 #     to do a traditional, purely software-pipelined loop.
    140 #   - We can't replace any of the remaining "add/zxt1" pairs with
    141 #     "padd1" because the latency for that instruction is too high
    142 #     and would push the loop to the point where more bypasses
    143 #     would be needed, which we don't have space for.
    144 #   - The above loop runs at around 3.26 cycles/byte, or roughly
    145 #     440 MByte/sec on a 1.5GHz Madison.  This is well below the
    146 #     system bus bandwidth and hence with judicious use of
    147 #     "lfetch" this loop can run at (almost) peak speed even when
    148 #     the input and output data reside in memory.  The
    149 #     max. latency that can be tolerated is (PREFETCH_DISTANCE *
    150 #     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
    151 #     least) 1-ahead prefetching of 128 byte cache-lines.  Note
    152 #     that we do NOT prefetch into L1, since that would only
    153 #     interfere with the S[] table values stored there.  This is
    154 #     acceptable because there is a 10 cycle latency between
    155 #     load and first use of the input data.
    156 #   - We use a branch to out-of-line bypass-code because of cycle-pressure:
    157 #     we calculate the next J, check for the need to activate the
    158 #     bypass path, and activate the bypass path ALL IN THE SAME
    159 #     CYCLE.  If we didn't have these constraints, we could do
    160 #     the bypass with a simple conditional move instruction.
    161 #     Fortunately, the bypass paths get activated relatively
    162 #     infrequently, so the extra branches don't cost all that much
    163 #     (about 0.04 cycles/byte, measured on a 16396 byte file with
    164 #     random input data).
    165 #
    166 
    167 $phases = 4;		# number of stages/phases in the pipelined-loop
    168 $unroll_count = 6;	# number of times we unrolled it
    169 $pComI = (1 << 0);
    170 $pComJ = (1 << 1);
    171 $pComT = (1 << 2);
    172 $pOut  = (1 << 3);
    173 
    174 $NData = 4;
    175 $NIP = 3;
    176 $NJP = 2;
    177 $NI = 2;
    178 $NSI = 3;
    179 $NSJ = 2;
    180 $NT = 2;
    181 $NOutWord = 2;
    182 
    183 #
    184 # $threshold is the minimum length before we attempt to use the
    185 # big software-pipelined loop.  It MUST be greater-or-equal
    186 # to:
    187 #  		PHASES * (UNROLL_COUNT + 1) + 7
    188 #
    189 # The "+ 7" comes from the fact we may have to encode up to
    190 #   7 bytes separately before the output pointer is aligned.
    191 #
    192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
    193 
    194 sub I {
    195     local *code = shift;
    196     local $format = shift;
    197     $code .= sprintf ("\t\t".$format."\n", @_);
    198 }
    199 
    200 sub P {
    201     local *code = shift;
    202     local $format = shift;
    203     $code .= sprintf ($format."\n", @_);
    204 }
    205 
    206 sub STOP {
    207     local *code = shift;
    208     $code .=<<___;
    209 		;;
    210 ___
    211 }
    212 
    213 sub emit_body {
    214     local *c = shift;
    215     local *bypass = shift;
    216     local ($iteration, $p) = @_;
    217 
    218     local $i0 = $iteration;
    219     local $i1 = $iteration - 1;
    220     local $i2 = $iteration - 2;
    221     local $i3 = $iteration - 3;
    222     local $iw0 = ($iteration - 3) / 8;
    223     local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
    224     local $byte_num = ($iteration - 3) % 8;
    225     local $label = $iteration + 1;
    226     local $pAny = ($p & 0xf) == 0xf;
    227     local $pByp = (($p & $pComI) && ($iteration > 0));
    228 
    229     $c.=<<___;
    230 //////////////////////////////////////////////////
    231 ___
    232 
    233     if (($p & 0xf) == 0) {
    234 	$c.="#ifdef HOST_IS_BIG_ENDIAN\n";
    235 	&I(\$c,"shr.u	OutWord[%u] = OutWord[%u], 32;;",
    236 				$iw1 % $NOutWord, $iw1 % $NOutWord);
    237 	$c.="#endif\n";
    238 	&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
    239 	return;
    240     }
    241 
    242     # Cycle 0
    243     &I(\$c, "{ .mmi")					      if ($pAny);
    244     &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
    245     &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
    246     &I(\$c, "zxt1   J = J")				      if ($p & $pComJ);
    247     &I(\$c, "}")					      if ($pAny);
    248     &I(\$c, "{ .mmi")					      if ($pAny);
    249     &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
    250     &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
    251        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)		      if ($p & $pComT);
    252     &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
    253     &I(\$c, "}")					      if ($pAny);
    254     &STOP(\$c);
    255 
    256     # Cycle 1
    257     &I(\$c, "{ .mmi")					      if ($pAny);
    258     &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
    259     &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
    260     &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
    261     &I(\$c, "}")					      if ($pAny);
    262     &I(\$c, "{ .mmi")					      if ($pAny);
    263     &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
    264     &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)		      if ($p & $pComJ);
    265     &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
    266        $i3 % $NData, $i3 % $NData, $i1 % $NT)		      if ($p & $pOut);
    267     &I(\$c, "}")					      if ($pAny);
    268     &STOP(\$c);
    269 
    270     # Cycle 2
    271     &I(\$c, "{ .mmi")					      if ($pAny);
    272     &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
    273     &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)	      if ($pByp);
    274     &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
    275        $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
    276     &I(\$c, "}")					      if ($pAny);
    277     &I(\$c, "{ .mmb")					      if ($pAny);
    278     &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)		      if ($p & $pComI);
    279     &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
    280     &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
    281     &I(\$c, "}") if ($pAny);
    282     &STOP(\$c);
    283 
    284     &P(\$c, ".rc4Resume%u:", $label)			      if ($pByp);
    285     if ($byte_num == 0 && $iteration >= $phases) {
    286 	&I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
    287 	   $iw1 % $NOutWord)				      if ($p & $pOut);
    288 	if ($iteration == (1 + $unroll_count) * $phases - 1) {
    289 	    if ($unroll_count == 6) {
    290 		&I(\$c, "mov OutWord[%u] = OutWord[%u]",
    291 		   $iw1 % $NOutWord, $iw0 % $NOutWord);
    292 	    }
    293 	    &I(\$c, "lfetch.nt1 [InPrefetch], %u",
    294 	       $unroll_count * $phases);
    295 	    &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
    296 	       $unroll_count * $phases);
    297 	    &I(\$c, "br.cloop.sptk.few .rc4Loop");
    298 	}
    299     }
    300 
    301     if ($pByp) {
    302 	&P(\$bypass, ".rc4Bypass%u:", $label);
    303 	&I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
    304 	&I(\$bypass, "nop 0");
    305 	&I(\$bypass, "nop 0");
    306 	&I(\$bypass, ";;");
    307 	&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
    308 	&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
    309 	&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
    310 	&I(\$bypass, ";;");
    311     }
    312 }
    313 
# Fixed front part of the generated assembly.  The here-document is
# interpolated by Perl: \\ yields one backslash, \" a double quote and
# \@ an at-sign, and $threshold, $unroll_count and $phases are replaced
# by their values.  Everything else is emitted verbatim; the #define/#if
# lines are presumably meant for a C preprocessor run over the output.
# In order, the text contains:
#   - symbolic names for the registers and predicates used by RC4;
#   - LKEY/SKEY/KEYADDR, chosen by SZ (which defaults to 4 if not set);
#   - ADDP and BYTE_POS, chosen by _HPUX_SOURCE/_LP64/B_ENDIAN;
#   - MODSCHED_RC4_PROLOGUE and MODSCHED_RC4_LOOP, the byte-at-a-time
#     modulo-scheduled loop;
#   - the RC4 entry point, which returns 0 for a non-positive length or a
#     null pointer, prefetches the state table, loads I and J from it and
#     branches to .rc4Remainder when the length is below $threshold;
#   - the byte loop that runs until OutPtr is 8-byte aligned;
#   - the computation of ar.lc for the unrolled loop that follows.
$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

#define LCSave		r8
#define PRSave		r9

/* Inputs become invalid once rotation begins!  */

#define StateTable	in0
#define DataLen		in1
#define InputBuffer	in2
#define OutputBuffer	in3

#define KTable		r14
#define J		r15
#define InPtr		r16
#define OutPtr		r17
#define InPrefetch	r18
#define OutPrefetch	r19
#define One		r20
#define LoopCount	r21
#define Remainder	r22
#define IFinal		r23
#define EndPtr		r24

#define tmp0		r25
#define tmp1		r26

#define pBypass		p6
#define pDone		p7
#define pSmall		p8
#define pAligned	p9
#define pUnaligned	p10

#define pComputeI	pPhase[0]
#define pComputeJ	pPhase[1]
#define pComputeT	pPhase[2]
#define pOutput		pPhase[3]

#define RetVal		r8
#define L_OK		p7
#define L_NOK		p8

#define	_NINPUTS	4
#define	_NOUTPUT	0

#define	_NROTATE	24
#define	_NLOCALS	(_NROTATE - _NINPUTS - _NOUTPUT)

#ifndef SZ
# define SZ	4	// this must be set to sizeof(RC4_INT)
#endif

#if SZ == 1
# define LKEY			ld1
# define SKEY			st1
# define KEYADDR(dst, i)	add dst = i, KTable
#elif SZ == 2
# define LKEY			ld2
# define SKEY			st2
# define KEYADDR(dst, i)	shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY			ld4
# define SKEY			st4
# define KEYADDR(dst, i)	shladd dst = i, 2, KTable
#else
# define LKEY			ld8
# define SKEY			st8
# define KEYADDR(dst, i)	shladd dst = i, 3, KTable
#endif

#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP	addp4
#else
# define ADDP	add
#endif

/* Define a macro for the bit number of the n-th byte: */

#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n)	(56 - (8 * (n)))
#else
# define BYTE_POS(n)	(8 * (n))
#endif

/*
   We must perform the first phase of the pipeline explicitly since
   we will always load from the stable the first time. The br.cexit
   will never be taken since regardless of the number of bytes because
   the epilogue count is 4.
*/
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
   assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE						   \\
	{								   \\
				ld1		Data[0] = [InPtr], 1;	   \\
				add		IFinal = 1, I[1];	   \\
				KEYADDR(IPr[0], I[1]);			   \\
	} ;;								   \\
	{								   \\
				LKEY		SI[0] = [IPr[0]];	   \\
				mov		pr.rot = 0x10000;	   \\
				mov		ar.ec = 4;		   \\
	} ;;								   \\
	{								   \\
				add		J = J, SI[0];		   \\
				zxt1		I[0] = IFinal;		   \\
				br.cexit.spnt.few .+16; /* never taken */  \\
	} ;;
#define MODSCHED_RC4_LOOP(label)					   \\
label:									   \\
	{	.mmi;							   \\
		(pComputeI)	ld1		Data[0] = [InPtr], 1;	   \\
		(pComputeI)	add		IFinal = 1, I[1];	   \\
		(pComputeJ)	zxt1		J = J;			   \\
	}{	.mmi;							   \\
		(pOutput)	LKEY		T[1] = [T[1]];		   \\
		(pComputeT)	add		T[0] = SI[2], SJ[1];	   \\
		(pComputeI)	KEYADDR(IPr[0], I[1]);			   \\
	} ;;								   \\
	{	.mmi;							   \\
		(pComputeT)	SKEY		[IPr[2]] = SJ[1];	   \\
		(pComputeT)	SKEY		[JP[1]] = SI[2];	   \\
		(pComputeT)	zxt1		T[0] = T[0];		   \\
	}{	.mmi;							   \\
		(pComputeI)	LKEY		SI[0] = [IPr[0]];	   \\
		(pComputeJ)	KEYADDR(JP[0], J);			   \\
		(pComputeI)	cmp.eq.unc	pBypass, p0 = I[1], J;	   \\
	} ;;								   \\
	{	.mmi;							   \\
		(pComputeJ)	LKEY		SJ[0] = [JP[0]];	   \\
		(pOutput)	xor		Data[3] = Data[3], T[1];   \\
				nop		0x0;			   \\
	}{	.mmi;							   \\
		(pComputeT)	KEYADDR(T[0], T[0]);			   \\
		(pBypass)	mov		SI[0] = SI[1];		   \\
		(pComputeI)	zxt1		I[0] = IFinal;		   \\
	} ;;								   \\
	{	.mmb;							   \\
		(pOutput)	st1		[OutPtr] = Data[3], 1;	   \\
		(pComputeI)	add		J = J, SI[0];		   \\
				br.ctop.sptk.few label;			   \\
	} ;;

	.text

	.align	32

	.type	RC4, \@function
	.global	RC4

	.proc	RC4
	.prologue

RC4:
	{
	  	.mmi
		alloc	r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE

		.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
		      OutWord[2]
		.rotp pPhase[4]

		ADDP		InPrefetch = 0, InputBuffer
		ADDP		KTable = 0, StateTable
	}
	{
		.mmi
		ADDP		InPtr = 0, InputBuffer
		ADDP		OutPtr = 0, OutputBuffer
		mov		RetVal = r0
	}
	;;
	{
		.mmi
		lfetch.nt1	[InPrefetch], 0x80
		ADDP		OutPrefetch = 0, OutputBuffer
	}
	{               // Return 0 if the input length is nonsensical
        	.mib
		ADDP		StateTable = 0, StateTable
        	cmp.ge.unc  	L_NOK, L_OK = r0, DataLen
	(L_NOK) br.ret.sptk.few rp
	}
	;;
	{
        	.mib
        	cmp.eq.or  	L_NOK, L_OK = r0, InPtr
        	cmp.eq.or  	L_NOK, L_OK = r0, OutPtr
		nop		0x0
	}
	{
		.mib
        	cmp.eq.or  	L_NOK, L_OK = r0, StateTable
		nop		0x0
	(L_NOK) br.ret.sptk.few rp
	}
	;;
		LKEY		I[1] = [KTable], SZ
/* Prefetch the state-table. It contains 256 elements of size SZ */

#if SZ == 1
		ADDP		tmp0 = 1*128, StateTable
#elif SZ == 2
		ADDP		tmp0 = 3*128, StateTable
		ADDP		tmp1 = 2*128, StateTable
#elif SZ == 4
		ADDP		tmp0 = 7*128, StateTable
		ADDP		tmp1 = 6*128, StateTable
#elif SZ == 8
		ADDP		tmp0 = 15*128, StateTable
		ADDP		tmp1 = 14*128, StateTable
#endif
		;;
#if SZ >= 8
		lfetch.fault.nt1		[tmp0], -256	// 15
		lfetch.fault.nt1		[tmp1], -256;;
		lfetch.fault.nt1		[tmp0], -256	// 13
		lfetch.fault.nt1		[tmp1], -256;;
		lfetch.fault.nt1		[tmp0], -256	// 11
		lfetch.fault.nt1		[tmp1], -256;;
		lfetch.fault.nt1		[tmp0], -256	//  9
		lfetch.fault.nt1		[tmp1], -256;;
#endif
#if SZ >= 4
		lfetch.fault.nt1		[tmp0], -256	//  7
		lfetch.fault.nt1		[tmp1], -256;;
		lfetch.fault.nt1		[tmp0], -256	//  5
		lfetch.fault.nt1		[tmp1], -256;;
#endif
#if SZ >= 2
		lfetch.fault.nt1		[tmp0], -256	//  3
		lfetch.fault.nt1		[tmp1], -256;;
#endif
	{
		.mii
		lfetch.fault.nt1		[tmp0]		//  1
		add		I[1]=1,I[1];;
		zxt1		I[1]=I[1]
	}
	{
		.mmi
		lfetch.nt1	[InPrefetch], 0x80
		lfetch.excl.nt1	[OutPrefetch], 0x80
		.save		pr, PRSave
		mov		PRSave = pr
	} ;;
	{
		.mmi
		lfetch.excl.nt1	[OutPrefetch], 0x80
		LKEY		J = [KTable], SZ
		ADDP		EndPtr = DataLen, InPtr
	}  ;;
	{
		.mmi
		ADDP		EndPtr = -1, EndPtr	// Make it point to
							// last data byte.
		mov		One = 1
		.save		ar.lc, LCSave
		mov		LCSave = ar.lc
		.body
	} ;;
	{
		.mmb
		sub		Remainder = 0, OutPtr
		cmp.gtu		pSmall, p0 = $threshold, DataLen
(pSmall)	br.cond.dpnt	.rc4Remainder		// Data too small for
							// big loop.
	} ;;
	{
		.mmi
		and		Remainder = 0x7, Remainder
		;;
		cmp.eq		pAligned, pUnaligned = Remainder, r0
		nop		0x0
	} ;;
	{
		.mmb
.pred.rel	"mutex",pUnaligned,pAligned
(pUnaligned)	add		Remainder = -1, Remainder
(pAligned)	sub		Remainder = EndPtr, InPtr
(pAligned)	br.cond.dptk.many .rc4Aligned
	} ;;
	{
		.mmi
		nop		0x0
		nop		0x0
		mov.i		ar.lc = Remainder
	}

/* Do the initial few bytes via the compact, modulo-scheduled loop
   until the output pointer is 8-byte-aligned.  */

		MODSCHED_RC4_PROLOGUE
		MODSCHED_RC4_LOOP(.RC4AlignLoop)

	{
		.mib
		sub		Remainder = EndPtr, InPtr
		zxt1		IFinal = IFinal
		clrrrb				// Clear CFM.rrb.pr so
		;;				// next "mov pr.rot = N"
						// does the right thing.
	}
	{
		.mmi
		mov		I[1] = IFinal
		nop		0x0
		nop		0x0
	} ;;


.rc4Aligned:

/*
   Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
 */

	{
		.mlx
		add	LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
		movl		Remainder = 0xaaaaaaaaaaaaaaab
	} ;;
	{
		.mmi
		setf.sig	f6 = LoopCount		// M2, M3	6 cyc
		setf.sig	f7 = Remainder		// M2, M3	6 cyc
		nop		0x0
	} ;;
	{
		.mfb
		nop		0x0
		xmpy.hu		f6 = f6, f7
		nop		0x0
	} ;;
	{
		.mmi
		getf.sig	LoopCount = f6;;	// M2		5 cyc
		nop		0x0
		shr.u		LoopCount = LoopCount, 4
	} ;;
	{
		.mmi
		nop		0x0
		nop		0x0
		mov.i		ar.lc = LoopCount
	} ;;

/* Now comes the unrolled loop: */

.rc4Prologue:
___
    668 
    669 $iteration = 0;
    670 
    671 # Generate the prologue:
    672 $predicates = 1;
    673 for ($i = 0; $i < $phases; ++$i) {
    674     &emit_body (\$code, \$bypass, $iteration++, $predicates);
    675     $predicates = ($predicates << 1) | 1;
    676 }
    677 
    678 $code.=<<___;
    679 .rc4Loop:
    680 ___
    681 
    682 # Generate the body:
    683 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
    684     &emit_body (\$code, \$bypass, $iteration++, $predicates);
    685 }
    686 
    687 $code.=<<___;
    688 .rc4Epilogue:
    689 ___
    690 
    691 # Generate the epilogue:
    692 for ($i = 0; $i < $phases; ++$i) {
    693     $predicates <<= 1;
    694     &emit_body (\$code, \$bypass, $iteration++, $predicates);
    695 }
    696 
# Fixed tail of the RC4 function, emitted verbatim (nothing in this
# here-document is interpolated).  After the unrolled loop it prefetches
# the line holding the last input byte, runs any bytes that are left
# through the modulo-scheduled byte loop, stores J and then I (minus one,
# zero-extended to a byte) back at the start of the state table,
# restores ar.lc and the predicate registers, and returns 1.
$code.=<<___;
	{
		.mmi
		lfetch.nt1	[EndPtr]	// fetch line with last byte
		mov		IFinal = I[1]
		nop		0x0
	}

.rc4Remainder:
	{
		.mmi
		sub		Remainder = EndPtr, InPtr	// Calculate
								// # of bytes
								// left - 1
		nop		0x0
		nop		0x0
	} ;;
	{
		.mib
		cmp.eq		pDone, p0 = -1, Remainder // done already?
		mov.i		ar.lc = Remainder
(pDone)		br.cond.dptk.few .rc4Complete
	}

/* Do the remaining bytes via the compact, modulo-scheduled loop */

		MODSCHED_RC4_PROLOGUE
		MODSCHED_RC4_LOOP(.RC4RestLoop)

.rc4Complete:
	{
		.mmi
		add		KTable = -SZ, KTable
		add		IFinal = -1, IFinal
		mov		ar.lc = LCSave
	} ;;
	{
		.mii
		SKEY		[KTable] = J,-SZ
		zxt1		IFinal = IFinal
		mov		pr = PRSave, 0x1FFFF
	} ;;
	{
		.mib
		SKEY		[KTable] = IFinal
		add		RetVal = 1, r0
		br.ret.sptk.few	rp
	} ;;
___
    746 
    747 # Last but not least, emit the code for the bypass-code of the unrolled loop:
    748 
    749 $code.=$bypass;
    750 
    751 $code.=<<___;
    752 	.endp RC4
    753 ___
    754 
    755 print $code;
    756