#!/usr/bin/env perl
#
# ====================================================================
# Written by David Mosberger <David.Mosberger (at] acm.org> based on the
# Itanium optimized Crypto code which was released by HP Labs at
# http://www.hpl.hp.com/research/linux/crypto/.
#
# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.



# This is a little helper program which generates a software-pipelined
# loop for RC4 encryption.  The basic algorithm looks like this:
#
#   for (counter = 0; counter < len; ++counter)
#     {
#       in = inp[counter];
#       SI = S[I];
#       J = (SI + J) & 0xff;
#       SJ = S[J];
#       T = (SI + SJ) & 0xff;
#       S[I] = SJ, S[J] = SI;
#       ST = S[T];
#       outp[counter] = in ^ ST;
#       I = (I + 1) & 0xff;
#     }
#
# Pipelining this loop isn't easy, because the stores to the S[] array
# need to be observed in the right order.  The loop generated by the
# code below has the following pipeline diagram:
#
#   cycle
#      | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
#   iter
#    1:  xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#    2:              xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#    3:                          xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#
# where:
#     LDI = load of S[I]
#     LDJ = load of S[J]
#     SWP = swap of S[I] and S[J]
#     LDT = load of S[T]
#
# Note that in the above diagram, the major trouble-spot is that LDI
# of the 2nd iteration is performed BEFORE the SWP of the first
# iteration.  Fortunately, this is easy to detect (J of the 1st
# iteration will be equal to I of the 2nd iteration) and when this
# happens, we simply forward the proper value from the 1st iteration
# to the 2nd one.  The proper value in this case is simply the value
# of S[I] from the first iteration (thanks to the fact that SWP
# simply swaps the contents of S[I] and S[J]).
#
# Another potential trouble-spot is in cycle 7, where SWP of the 1st
# iteration issues at the same time as the LDI of the 3rd iteration.
# However, thanks to IA-64 execution semantics, this can be taken
# care of simply by placing LDI later in the instruction-group than
# SWP.  IA-64 CPUs will automatically forward the value if they
# detect that the SWP and LDI are accessing the same memory-location.
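#
# In scalar terms, the forwarding amounts to the following fix-up (a
# rough C-like sketch, not the generated code; the _1/_2 suffixes mark
# which iteration an intermediate value belongs to):
#
#     SI_2 = S[I_2];     /* early load, issued before iteration 1's SWP */
#     if (I_2 == J_1)    /* iteration 1's SWP will overwrite S[J_1]...  */
#       SI_2 = SI_1;     /* ...with SI_1, so forward that value instead */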

# The core-loop that can be pipelined then looks like this (annotated
# with McKinley/Madison issue port & latency numbers, assuming L1
# cache hits for the most part):

#   operation:              instruction:                    issue-ports:    latency
#   ------------------      -----------------------------   -------------   -------

#   Data = *inp++           ld1 data = [inp], 1             M0-M1           1 cyc   c0
#                           shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1   1 cyc
#   I = (I + 1) & 0xff      padd1 nextI = I, one            M0-M3, I0, I1   3 cyc
#                           ;;
#   SI = S[I]               ld8 SI = [Iptr]                 M0-M1           1 cyc   c1  * after SWAP!
#                           ;;
#                           cmp.eq.unc pBypass = I, J                               * after J is valid!
#   J = SI + J              add J = J, SI                   M0-M3, I0, I1   1 cyc   c2
#                           (pBypass) br.cond.spnt Bypass
#                           ;;
#   ---------------------------------------------------------------------------------------
#   J = J & 0xff            zxt1 J = J                      I0, I1          1 cyc   c3
#                           ;;
#                           shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1   1 cyc   c4
#                           ;;
#   SJ = S[J]               ld8 SJ = [Jptr]                 M0-M1           1 cyc   c5
#                           ;;
#   ---------------------------------------------------------------------------------------
#   T = (SI + SJ)           add T = SI, SJ                  M0-M3, I0, I1   1 cyc   c6
#                           ;;
#   T = T & 0xff            zxt1 T = T                      I0, I1          1 cyc
#   S[I] = SJ               st8 [Iptr] = SJ                 M2-M3                   c7
#   S[J] = SI               st8 [Jptr] = SI                 M2-M3
#                           ;;
#                           shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1   1 cyc   c8
#                           ;;
#   ---------------------------------------------------------------------------------------
#   T = S[T]                ld8 T = [Tptr]                  M0-M1           1 cyc   c9
#                           ;;
#   data ^= T               xor data = data, T              M0-M3, I0, I1   1 cyc   c10
#                           ;;
#   *out++ = Data ^ T       dep word = word, data, 8, POS   I0, I1          1 cyc   c11
#                           ;;
#   ---------------------------------------------------------------------------------------

# There are several points worth making here:

#   - Note that due to the bypass/forwarding-path, the first two
#     phases of the loop are strangely mingled together.  In
#     particular, note that the first stage of the pipeline is
#     using the value of "J", as calculated by the second stage.
#   - Each bundle-pair will have exactly 6 instructions.
#   - Pipelined, the loop can execute in 3 cycles/iteration and
#     4 stages.  However, McKinley/Madison can issue "st1" to
#     the same bank at a rate of at most one per 4 cycles.  Thus,
#     instead of storing each byte, we accumulate them in a word
#     and then write them back at once with a single "st8" (this
#     implies that the setup code needs to ensure that the output
#     buffer is properly aligned, if need be, by encoding the
#     first few bytes separately).
#   - There is no space for a "br.ctop" instruction.  For this
#     reason we can't use the modulo-loop support in IA-64 and have
#     to do a traditional, purely software-pipelined loop.
#   - We can't replace any of the remaining "add/zxt1" pairs with
#     "padd1" because the latency for that instruction is too high
#     and would push the loop to the point where more bypasses
#     would be needed, which we don't have space for.
#   - The above loop runs at around 3.26 cycles/byte, or roughly
#     440 MByte/sec on a 1.5GHz Madison.  This is well below the
#     system bus bandwidth and hence with judicious use of
#     "lfetch" this loop can run at (almost) peak speed even when
#     the input and output data reside in memory.  The
#     max. latency that can be tolerated is (PREFETCH_DISTANCE *
#     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
#     least) 1-ahead prefetching of 128 byte cache-lines.  Note
#     that we do NOT prefetch into L1, since that would only
#     interfere with the S[] table values stored there.  This is
#     acceptable because there is a 10 cycle latency between
#     load and first use of the input data.
#   - We use a branch to out-of-line bypass-code because of
#     cycle-pressure: we calculate the next J, check for the need
#     to activate the bypass path, and activate the bypass path
#     ALL IN THE SAME CYCLE.  If we didn't have these constraints,
#     we could do the bypass with a simple conditional move
#     instruction.  Fortunately, the bypass paths get activated
#     relatively infrequently, so the extra branches don't cost
#     all that much (about 0.04 cycles/byte, measured on a 16396
#     byte file with random input data).
#

$phases = 4;            # number of stages/phases in the pipelined-loop
$unroll_count = 6;      # number of times we unrolled it
$pComI = (1 << 0);
$pComJ = (1 << 1);
$pComT = (1 << 2);
$pOut  = (1 << 3);

$NData = 4;
$NIP = 3;
$NJP = 2;
$NI = 2;
$NSI = 3;
$NSJ = 2;
$NT = 2;
$NOutWord = 2;

#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop.  It MUST be greater-or-equal
# to:
#               PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
# 7 bytes separately before the output pointer is aligned.
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
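# With $phases = 4 and $unroll_count = 6 the minimum required by the
# formula above would be 4 * (6 + 1) + 7 = 35 bytes; the value computed
# here works out to 3 * (4 * 7) + 7 = 91 bytes, comfortably above that.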

# Append one instruction (indented by two tabs) to the given code buffer.
sub I {
    local *code = shift;
    local $format = shift;
    $code .= sprintf ("\t\t".$format."\n", @_);
}

# Append one line without indentation (used for labels and predicated branches).
sub P {
    local *code = shift;
    local $format = shift;
    $code .= sprintf ($format."\n", @_);
}

# Append an instruction-group stop (";;") to the given code buffer.
sub STOP {
    local *code = shift;
    $code .=<<___;
	;;
___
}

# Emit the three-cycle body of one pipelined iteration.  $p is a bit-mask
# of the phases ($pComI/$pComJ/$pComT/$pOut) that are live for this
# iteration; $bypass accumulates the out-of-line bypass code.
sub emit_body {
    local *c = shift;
    local *bypass = shift;
    local ($iteration, $p) = @_;

    local $i0 = $iteration;
    local $i1 = $iteration - 1;
    local $i2 = $iteration - 2;
    local $i3 = $iteration - 3;
    local $iw0 = ($iteration - 3) / 8;
    local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
    local $byte_num = ($iteration - 3) % 8;
    local $label = $iteration + 1;
    local $pAny = ($p & 0xf) == 0xf;
    local $pByp = (($p & $pComI) && ($iteration > 0));

    $c.=<<___;
//////////////////////////////////////////////////
___

    if (($p & 0xf) == 0) {
        $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
        &I(\$c, "shr.u OutWord[%u] = OutWord[%u], 32;;",
           $iw1 % $NOutWord, $iw1 % $NOutWord);
        $c.="#endif\n";
        &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
        return;
    }

    # Cycle 0
    &I(\$c, "{ .mmi")                                           if ($pAny);
    &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData)          if ($p & $pComI);
    &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)   if ($p & $pComI);
    &I(\$c, "zxt1 J = J")                                       if ($p & $pComJ);
    &I(\$c, "}")                                                if ($pAny);
    &I(\$c, "{ .mmi")                                           if ($pAny);
    &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)       if ($p & $pOut);
    &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
       $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                       if ($p & $pComT);
    &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI)   if ($p & $pComI);
    &I(\$c, "}")                                                if ($pAny);
    &STOP(\$c);

    # Cycle 1
    &I(\$c, "{ .mmi")                                           if ($pAny);
    &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1 % $NSJ)  if ($p & $pComT);
    &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2 % $NSI)   if ($p & $pComT);
    &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)         if ($p & $pComT);
    &I(\$c, "}")                                                if ($pAny);
    &I(\$c, "{ .mmi")                                           if ($pAny);
    &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0 % $NIP)  if ($p & $pComI);
    &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                   if ($p & $pComJ);
    &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
       $i3 % $NData, $i3 % $NData, $i1 % $NT)                   if ($p & $pOut);
    &I(\$c, "}")                                                if ($pAny);
    &STOP(\$c);

    # Cycle 2
    &I(\$c, "{ .mmi")                                           if ($pAny);
    &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0 % $NJP)   if ($p & $pComJ);
    &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)         if ($pByp);
    &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
       $iw0 % $NOutWord, $i3 % $NData, $iw1 % $NOutWord, $byte_num) if ($p & $pOut);
    &I(\$c, "}")                                                if ($pAny);
    &I(\$c, "{ .mmb")                                           if ($pAny);
    &I(\$c, "add J = J, SI[%u]", $i0 % $NSI)                    if ($p & $pComI);
    &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)      if ($p & $pComT);
    &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u", $label) if ($pByp);
    &I(\$c, "}")                                                if ($pAny);
    &STOP(\$c);

    &P(\$c, ".rc4Resume%u:", $label)                            if ($pByp);
    if ($byte_num == 0 && $iteration >= $phases) {
        &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
           $iw1 % $NOutWord)                                    if ($p & $pOut);
        if ($iteration == (1 + $unroll_count) * $phases - 1) {
            if ($unroll_count == 6) {
                &I(\$c, "mov OutWord[%u] = OutWord[%u]",
                   $iw1 % $NOutWord, $iw0 % $NOutWord);
            }
            &I(\$c, "lfetch.nt1 [InPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "br.cloop.sptk.few .rc4Loop");
        }
    }

    if ($pByp) {
        &P(\$bypass, ".rc4Bypass%u:", $label);
        &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
        &I(\$bypass, "nop 0");
        &I(\$bypass, "nop 0");
        &I(\$bypass, ";;");
        &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
        &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
        &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
        &I(\$bypass, ";;");
    }
}
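
# As an illustration of the helpers above (the literal index 3 is made
# up for this example; the script itself always passes the rotating
# indices computed in emit_body): a call such as
#
#       &I(\$c, "ld1 Data[%u] = [InPtr], 1", 3);
#
# appends the single line "\t\tld1 Data[3] = [InPtr], 1" to the code
# buffer, so emit_body() is essentially a template for one iteration's
# three bundle-pairs, with the phase mask $p selecting the live slots
# and the "% $NData"-style expressions picking which rotating-register
# instance gets named.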

$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

#define LCSave          r8
#define PRSave          r9

/* Inputs become invalid once rotation begins! */

#define StateTable      in0
#define DataLen         in1
#define InputBuffer     in2
#define OutputBuffer    in3

#define KTable          r14
#define J               r15
#define InPtr           r16
#define OutPtr          r17
#define InPrefetch      r18
#define OutPrefetch     r19
#define One             r20
#define LoopCount       r21
#define Remainder       r22
#define IFinal          r23
#define EndPtr          r24

#define tmp0            r25
#define tmp1            r26

#define pBypass         p6
#define pDone           p7
#define pSmall          p8
#define pAligned        p9
#define pUnaligned      p10

#define pComputeI       pPhase[0]
#define pComputeJ       pPhase[1]
#define pComputeT       pPhase[2]
#define pOutput         pPhase[3]

#define RetVal          r8
#define L_OK            p7
#define L_NOK           p8

#define _NINPUTS        4
#define _NOUTPUT        0

#define _NROTATE        24
#define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)

#ifndef SZ
# define SZ     4       // this must be set to sizeof(RC4_INT)
#endif

#if SZ == 1
# define LKEY                   ld1
# define SKEY                   st1
# define KEYADDR(dst, i)        add dst = i, KTable
#elif SZ == 2
# define LKEY                   ld2
# define SKEY                   st2
# define KEYADDR(dst, i)        shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY                   ld4
# define SKEY                   st4
# define KEYADDR(dst, i)        shladd dst = i, 2, KTable
#else
# define LKEY                   ld8
# define SKEY                   st8
# define KEYADDR(dst, i)        shladd dst = i, 3, KTable
#endif

#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP   addp4
#else
# define ADDP   add
#endif

/* Define a macro for the bit number of the n-th byte: */

#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n)    (56 - (8 * (n)))
#else
# define BYTE_POS(n)    (8 * (n))
#endif

/*
   We must perform the first phase of the pipeline explicitly since
   we will always load from the state table the first time.  The
   br.cexit will never be taken, regardless of the number of bytes,
   because the epilogue count is 4.
 */
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
   assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE                                   \\
        {                                                       \\
                ld1 Data[0] = [InPtr], 1;                       \\
                add IFinal = 1, I[1];                           \\
                KEYADDR(IPr[0], I[1]);                          \\
        } ;;                                                    \\
        {                                                       \\
                LKEY SI[0] = [IPr[0]];                          \\
                mov pr.rot = 0x10000;                           \\
                mov ar.ec = 4;                                  \\
        } ;;                                                    \\
        {                                                       \\
                add J = J, SI[0];                               \\
                zxt1 I[0] = IFinal;                             \\
                br.cexit.spnt.few .+16; /* never taken */       \\
        } ;;
#define MODSCHED_RC4_LOOP(label)                                \\
label:                                                          \\
        { .mmi;                                                 \\
    (pComputeI) ld1 Data[0] = [InPtr], 1;                       \\
    (pComputeI) add IFinal = 1, I[1];                           \\
    (pComputeJ) zxt1 J = J;                                     \\
        }{ .mmi;                                                \\
    (pOutput)   LKEY T[1] = [T[1]];                             \\
    (pComputeT) add T[0] = SI[2], SJ[1];                        \\
    (pComputeI) KEYADDR(IPr[0], I[1]);                          \\
        } ;;                                                    \\
        { .mmi;                                                 \\
    (pComputeT) SKEY [IPr[2]] = SJ[1];                          \\
    (pComputeT) SKEY [JP[1]] = SI[2];                           \\
    (pComputeT) zxt1 T[0] = T[0];                               \\
        }{ .mmi;                                                \\
    (pComputeI) LKEY SI[0] = [IPr[0]];                          \\
    (pComputeJ) KEYADDR(JP[0], J);                              \\
    (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J;               \\
        } ;;                                                    \\
        { .mmi;                                                 \\
    (pComputeJ) LKEY SJ[0] = [JP[0]];                           \\
    (pOutput)   xor Data[3] = Data[3], T[1];                    \\
                nop 0x0;                                        \\
        }{ .mmi;                                                \\
    (pComputeT) KEYADDR(T[0], T[0]);                            \\
    (pBypass)   mov SI[0] = SI[1];                              \\
    (pComputeI) zxt1 I[0] = IFinal;                             \\
        } ;;                                                    \\
        { .mmb;                                                 \\
    (pOutput)   st1 [OutPtr] = Data[3], 1;                      \\
    (pComputeI) add J = J, SI[0];                               \\
                br.ctop.sptk.few label;                         \\
        } ;;

        .text

        .align 32

        .type RC4, \@function
        .global RC4

        .proc RC4
        .prologue

RC4:
        {
         .mmi
        alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE

        .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
              OutWord[2]
        .rotp pPhase[4]

        ADDP InPrefetch = 0, InputBuffer
        ADDP KTable = 0, StateTable
        }
        {
         .mmi
        ADDP InPtr = 0, InputBuffer
        ADDP OutPtr = 0, OutputBuffer
        mov RetVal = r0
        }
        ;;
        {
         .mmi
        lfetch.nt1 [InPrefetch], 0x80
        ADDP OutPrefetch = 0, OutputBuffer
        }
        { // Return 0 if the input length is nonsensical
         .mib
        ADDP StateTable = 0, StateTable
        cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK) br.ret.sptk.few rp
        }
        ;;
        {
         .mib
        cmp.eq.or L_NOK, L_OK = r0, InPtr
        cmp.eq.or L_NOK, L_OK = r0, OutPtr
        nop 0x0
        }
        {
         .mib
        cmp.eq.or L_NOK, L_OK = r0, StateTable
        nop 0x0
(L_NOK) br.ret.sptk.few rp
        }
        ;;
        LKEY I[1] = [KTable], SZ
        /* Prefetch the state-table.  It contains 256 elements of size SZ */

#if SZ == 1
        ADDP tmp0 = 1*128, StateTable
#elif SZ == 2
        ADDP tmp0 = 3*128, StateTable
        ADDP tmp1 = 2*128, StateTable
#elif SZ == 4
        ADDP tmp0 = 7*128, StateTable
        ADDP tmp1 = 6*128, StateTable
#elif SZ == 8
        ADDP tmp0 = 15*128, StateTable
        ADDP tmp1 = 14*128, StateTable
#endif
        ;;
#if SZ >= 8
        lfetch.fault.nt1 [tmp0], -256   // 15
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 13
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 11
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   //  9
        lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 4
        lfetch.fault.nt1 [tmp0], -256   //  7
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   //  5
        lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 2
        lfetch.fault.nt1 [tmp0], -256   //  3
        lfetch.fault.nt1 [tmp1], -256;;
#endif
        {
         .mii
        lfetch.fault.nt1 [tmp0]         //  1
        add I[1] = 1, I[1];;
        zxt1 I[1] = I[1]
        }
        {
         .mmi
        lfetch.nt1 [InPrefetch], 0x80
        lfetch.excl.nt1 [OutPrefetch], 0x80
        .save pr, PRSave
        mov PRSave = pr
        } ;;
        {
         .mmi
        lfetch.excl.nt1 [OutPrefetch], 0x80
        LKEY J = [KTable], SZ
        ADDP EndPtr = DataLen, InPtr
        } ;;
        {
         .mmi
        ADDP EndPtr = -1, EndPtr        // Make it point to
                                        // last data byte.
        mov One = 1
        .save ar.lc, LCSave
        mov LCSave = ar.lc
        .body
        } ;;
        {
         .mmb
        sub Remainder = 0, OutPtr
        cmp.gtu pSmall, p0 = $threshold, DataLen
(pSmall)        br.cond.dpnt .rc4Remainder      // Data too small for
                                                // big loop.
        } ;;
        {
         .mmi
        and Remainder = 0x7, Remainder  // Remainder = (-OutPtr) & 7, i.e.
                                        // bytes until OutPtr is 8-byte aligned
        ;;
        cmp.eq pAligned, pUnaligned = Remainder, r0
        nop 0x0
        } ;;
        {
         .mmb
        .pred.rel "mutex",pUnaligned,pAligned
(pUnaligned)    add Remainder = -1, Remainder
(pAligned)      sub Remainder = EndPtr, InPtr
(pAligned)      br.cond.dptk.many .rc4Aligned
        } ;;
        {
         .mmi
        nop 0x0
        nop 0x0
        mov.i ar.lc = Remainder
        }

/* Do the initial few bytes via the compact, modulo-scheduled loop
   until the output pointer is 8-byte-aligned. */

        MODSCHED_RC4_PROLOGUE
        MODSCHED_RC4_LOOP(.RC4AlignLoop)

        {
         .mib
        sub Remainder = EndPtr, InPtr
        zxt1 IFinal = IFinal
        clrrrb                          // Clear CFM.rrb.pr so
        ;;                              // next "mov pr.rot = N"
                                        // does the right thing.
        }
        {
         .mmi
        mov I[1] = IFinal
        nop 0x0
        nop 0x0
        } ;;


.rc4Aligned:

        /*
           Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
         */

        {
         .mlx
        add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
        movl Remainder = 0xaaaaaaaaaaaaaaab     // = ceil(2^65/3)
        } ;;
        {
         .mmi
        setf.sig f6 = LoopCount         // M2, M3 6 cyc
        setf.sig f7 = Remainder         // M2, M3 6 cyc
        nop 0x0
        } ;;
        {
         .mfb
        nop 0x0
        xmpy.hu f6 = f6, f7             // high half = floor(2*LoopCount/3)
        nop 0x0
        } ;;
        {
         .mmi
        getf.sig LoopCount = f6;;       // M2 5 cyc
        nop 0x0
        shr.u LoopCount = LoopCount, 4  // ...and >>4 completes the division
                                        // by $unroll_count*$phases
        } ;;
        {
         .mmi
        nop 0x0
        nop 0x0
        mov.i ar.lc = LoopCount
        } ;;

/* Now comes the unrolled loop: */

.rc4Prologue:
___

$iteration = 0;

# Generate the prologue:
$predicates = 1;
for ($i = 0; $i < $phases; ++$i) {
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
    $predicates = ($predicates << 1) | 1;
}

$code.=<<___;
.rc4Loop:
___

# Generate the body:
for ($i = 0; $i < $unroll_count*$phases; ++$i) {
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
}

$code.=<<___;
.rc4Epilogue:
___

# Generate the epilogue:
for ($i = 0; $i < $phases; ++$i) {
    $predicates <<= 1;
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
}

$code.=<<___;
        {
         .mmi
        lfetch.nt1 [EndPtr]             // fetch line with last byte
        mov IFinal = I[1]
        nop 0x0
        }

.rc4Remainder:
        {
         .mmi
        sub Remainder = EndPtr, InPtr   // Calculate
                                        // # of bytes
                                        // left - 1
        nop 0x0
        nop 0x0
        } ;;
        {
         .mib
        cmp.eq pDone, p0 = -1, Remainder // done already?
        mov.i ar.lc = Remainder
(pDone) br.cond.dptk.few .rc4Complete
        }

/* Do the remaining bytes via the compact, modulo-scheduled loop */

        MODSCHED_RC4_PROLOGUE
        MODSCHED_RC4_LOOP(.RC4RestLoop)

.rc4Complete:
        {
         .mmi
        add KTable = -SZ, KTable
        add IFinal = -1, IFinal
        mov ar.lc = LCSave
        } ;;
        {
         .mii
        SKEY [KTable] = J,-SZ
        zxt1 IFinal = IFinal
        mov pr = PRSave, 0x1FFFF
        } ;;
        {
         .mib
        SKEY [KTable] = IFinal
        add RetVal = 1, r0
        br.ret.sptk.few rp
        } ;;
___

# Last but not least, emit the code for the bypass-code of the unrolled loop:

$code.=$bypass;

$code.=<<___;
        .endp RC4
___

print $code;
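
# The finished module is written to stdout; presumably it is invoked
# along the lines of (the exact build glue is not part of this file):
#
#       perl rc4-ia64.pl > rc4-ia64.S
#
# Note that the output still contains C-preprocessor directives, so it
# is meant to be run through cpp (or a cpp-driving assembler front-end),
# with SZ overridable to match sizeof(RC4_INT); it defaults to 4 above.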