Home | History | Annotate | Download | only in rc4
      1 // Original source:
      2 //	http://www.zorinaq.com/papers/rc4-amd64.html
      3 //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
      4 
      5 #include "textflag.h"
      6 
      7 // Local modifications:
      8 //
      9 // Transliterated from GNU to 6a assembly syntax by the Go authors.
     10 // The comments and spacing are from the original.
     11 //
     12 // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
     13 //
     14 // The original code accumulated 64 bits of key stream in an integer
     15 // register and then XOR'ed the key stream into the data 8 bytes at a time.
     16 // Modified to accumulate 128 bits of key stream into an XMM register
     17 // and then XOR the key stream into the data 16 bytes at a time.
     18 // Approximately doubles throughput.
     19 
     20 // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
     21 // but makes the code run 2.0x slower on Xeon.
     22 #define EXTEND(r) MOVBLZX r, r
     23 
     24 /*
     25 ** RC4 implementation optimized for AMD64.
     26 **
     27 ** Author: Marc Bevand <bevand_m (at) epita.fr>
     28 ** Licence: I hereby disclaim the copyright on this code and place it
     29 ** in the public domain.
     30 **
     31 ** The code has been designed to be easily integrated into openssl:
     32 ** the exported RC4() function can replace the actual implementations
     33 ** openssl already contains. Please note that when linking with openssl,
     34 ** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
     35 ** with -DRC4_INT='unsigned long'.
     36 **
     37 ** The throughput achieved by this code is about 320 MBytes/sec, on
     38 ** a 1.8 GHz AMD Opteron (rev C0) processor.
     39 */
     40 
     41 TEXT xorKeyStream(SB),NOSPLIT,$0
     42 	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
     43 	MOVQ	src+8(FP),	SI		// in = ARG(in)
     44 	MOVQ	dst+0(FP),	DI		// out = ARG(out)
     45 	MOVQ	state+24(FP),	BP		// d = ARG(data)
     46 	MOVQ	i+32(FP),	AX
     47 	MOVBQZX	0(AX),		CX		// x = *xp
     48 	MOVQ	j+40(FP),	AX
     49 	MOVBQZX	0(AX),		DX		// y = *yp
     50 
     51 	LEAQ	(SI)(BX*1),	R9		// limit = in+len
     52 
     53 l1:	CMPQ	SI,		R9		// cmp in with in+len
     54 	JGE	finished			// jump if (in >= in+len)
     55 
     56 	INCB	CX
     57 	EXTEND(CX)
     58 	TESTL	$15,		CX
     59 	JZ	wordloop
     60 
     61 	MOVBLZX	(BP)(CX*4),	AX
     62 
     63 	ADDB	AX,		DX		// y += tx
     64 	EXTEND(DX)
     65 	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
     66 	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
     67 	ADDB	AX,		BX		// val = ty+tx
     68 	EXTEND(BX)
     69 	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
     70 	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
     71 	XORB	(SI),		R8		// xor 1 byte
     72 	MOVB	R8,		(DI)
     73 	INCQ	SI				// in++
     74 	INCQ	DI				// out++
     75 	JMP l1
     76 
     77 wordloop:
     78 	SUBQ	$16,		R9
     79 	CMPQ	SI,		R9
     80 	JGT	end
     81 
     82 start:
     83 	ADDQ	$16,		SI		// increment in
     84 	ADDQ	$16,		DI		// increment out
     85 
     86 	// Each KEYROUND generates one byte of key and
     87 	// inserts it into an XMM register at the given 16-bit index.
     88 	// The key state array is uint32 words only using the bottom
     89 	// byte of each word, so the 16-bit OR only copies 8 useful bits.
     90 	// We accumulate alternating bytes into X0 and X1, and then at
     91 	// the end we OR X1<<8 into X0 to produce the actual key.
     92 	//
     93 	// At the beginning of the loop, CX%16 == 0, so the 16 loads
     94 	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
     95 	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
     96 	// without fear of the byte computation CX+15 wrapping around.
     97 	//
     98 	// The first round needs R12[0], the second needs R12[1], and so on.
     99 	// We can avoid memory stalls by starting the load for round n+1
    100 	// before the end of round n, using the LOAD macro.
    101 	LEAQ	(BP)(CX*4),	R12
    102 
    103 #define KEYROUND(xmm, load, off, r1, r2, index) \
    104 	MOVBLZX	(BP)(DX*4),	R8; \
    105 	MOVB	r1,		(BP)(DX*4); \
    106 	load((off+1), r2); \
    107 	MOVB	R8,		(off*4)(R12); \
    108 	ADDB	r1,		R8; \
    109 	EXTEND(R8); \
    110 	PINSRW	$index, (BP)(R8*4), xmm
    111 
    112 #define LOAD(off, reg) \
    113 	MOVBLZX	(off*4)(R12),	reg; \
    114 	ADDB	reg,		DX; \
    115 	EXTEND(DX)
    116 
    117 #define SKIP(off, reg)
    118 
    119 	LOAD(0, AX)
    120 	KEYROUND(X0, LOAD, 0, AX, BX, 0)
    121 	KEYROUND(X1, LOAD, 1, BX, AX, 0)
    122 	KEYROUND(X0, LOAD, 2, AX, BX, 1)
    123 	KEYROUND(X1, LOAD, 3, BX, AX, 1)
    124 	KEYROUND(X0, LOAD, 4, AX, BX, 2)
    125 	KEYROUND(X1, LOAD, 5, BX, AX, 2)
    126 	KEYROUND(X0, LOAD, 6, AX, BX, 3)
    127 	KEYROUND(X1, LOAD, 7, BX, AX, 3)
    128 	KEYROUND(X0, LOAD, 8, AX, BX, 4)
    129 	KEYROUND(X1, LOAD, 9, BX, AX, 4)
    130 	KEYROUND(X0, LOAD, 10, AX, BX, 5)
    131 	KEYROUND(X1, LOAD, 11, BX, AX, 5)
    132 	KEYROUND(X0, LOAD, 12, AX, BX, 6)
    133 	KEYROUND(X1, LOAD, 13, BX, AX, 6)
    134 	KEYROUND(X0, LOAD, 14, AX, BX, 7)
    135 	KEYROUND(X1, SKIP, 15, BX, AX, 7)
    136 
    137 	ADDB	$16,		CX
    138 
    139 	PSLLQ	$8,		X1
    140 	PXOR	X1,		X0
    141 	MOVOU	-16(SI),	X2
    142 	PXOR	X0,		X2
    143 	MOVOU	X2,		-16(DI)
    144 
    145 	CMPQ	SI,		R9		// cmp in with in+len-16
    146 	JLE	start				// jump if (in <= in+len-16)
    147 
    148 end:
    149 	DECB	CX
    150 	ADDQ	$16,		R9		// tmp = in+len
    151 
    152 	// handle the last bytes, one by one
    153 l2:	CMPQ	SI,		R9		// cmp in with in+len
    154 	JGE	finished			// jump if (in >= in+len)
    155 
    156 	INCB	CX
    157 	EXTEND(CX)
    158 	MOVBLZX	(BP)(CX*4),	AX
    159 
    160 	ADDB	AX,		DX		// y += tx
    161 	EXTEND(DX)
    162 	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
    163 	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
    164 	ADDB	AX,		BX		// val = ty+tx
    165 	EXTEND(BX)
    166 	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
    167 	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
    168 	XORB	(SI),		R8		// xor 1 byte
    169 	MOVB	R8,		(DI)
    170 	INCQ	SI				// in++
    171 	INCQ	DI				// out++
    172 	JMP l2
    173 
    174 finished:
    175 	MOVQ	j+40(FP),	BX
    176 	MOVB	DX, 0(BX)
    177 	MOVQ	i+32(FP),	AX
    178 	MOVB	CX, 0(AX)
    179 	RET
    180