Home | History | Annotate | Download | only in rc4
      1 // Original source:
      2 //	http://www.zorinaq.com/papers/rc4-amd64.html
      3 //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
      4 
      5 #include "textflag.h"
      6 
      7 // Local modifications:
      8 //
      9 // Transliterated from GNU to 6a assembly syntax by the Go authors.
     10 // The comments and spacing are from the original.
     11 //
     12 // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
     13 //
     14 // The original code accumulated 64 bits of key stream in an integer
     15 // register and then XOR'ed the key stream into the data 8 bytes at a time.
     16 // Modified to accumulate 128 bits of key stream into an XMM register
     17 // and then XOR the key stream into the data 16 bytes at a time.
     18 // Approximately doubles throughput.
     19 //
     20 // Converted to amd64p32.
     21 //
     22 // To make safe for Native Client, avoid use of BP, R15,
     23 // and two-register addressing modes.
     24 
     25 // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
     26 // but makes the code run 2.0x slower on Xeon.
     27 #define EXTEND(r) MOVBLZX r, r
     28 
     29 /*
     30 ** RC4 implementation optimized for AMD64.
     31 **
     32 ** Author: Marc Bevand <bevand_m (at) epita.fr>
     33 ** Licence: I hereby disclaim the copyright on this code and place it
     34 ** in the public domain.
     35 **
     36 ** The code has been designed to be easily integrated into openssl:
     37 ** the exported RC4() function can replace the actual implementations
     38 ** openssl already contains. Please note that when linking with openssl,
     39 ** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
     40 ** with -DRC4_INT='unsigned long'.
     41 **
     42 ** The throughput achieved by this code is about 320 MBytes/sec, on
     43 ** a 1.8 GHz AMD Opteron (rev C0) processor.
     44 */
     45 
     46 TEXT xorKeyStream(SB),NOSPLIT,$0
     47 	MOVL	n+8(FP),	BX		// rbx = ARG(len)
     48 	MOVL	src+4(FP),	SI		// in = ARG(in)
     49 	MOVL	dst+0(FP),	DI		// out = ARG(out)
     50 	MOVL	state+12(FP),	R10		// d = ARG(data)
     51 	MOVL	i+16(FP),	AX
     52 	MOVBQZX	0(AX),		CX		// x = *xp
     53 	MOVL	j+20(FP),	AX
     54 	MOVBQZX	0(AX),		DX		// y = *yp
     55 
     56 	LEAQ	(SI)(BX*1),	R9		// limit = in+len
     57 
     58 l1:	CMPQ	SI,		R9		// cmp in with in+len
     59 	JGE	finished			// jump if (in >= in+len)
     60 
     61 	INCB	CX
     62 	EXTEND(CX)
     63 	TESTL	$15,		CX
     64 	JZ	wordloop
     65 	LEAL	(R10)(CX*4), R12
     66 
     67 	MOVBLZX	(R12),	AX
     68 
     69 	ADDB	AX,		DX		// y += tx
     70 	EXTEND(DX)
     71 	LEAL (R10)(DX*4), R11
     72 	MOVBLZX	(R11),	BX		// ty = d[y]
     73 	MOVB	BX,		(R12)	// d[x] = ty
     74 	ADDB	AX,		BX		// val = ty+tx
     75 	EXTEND(BX)
     76 	LEAL (R10)(BX*4), R13
     77 	MOVB	AX,		(R11)	// d[y] = tx
     78 	MOVBLZX	(R13),	R8		// val = d[val]
     79 	XORB	(SI),		R8		// xor 1 byte
     80 	MOVB	R8,		(DI)
     81 	INCQ	SI				// in++
     82 	INCQ	DI				// out++
     83 	JMP l1
     84 
     85 wordloop:
     86 	SUBQ	$16,		R9
     87 	CMPQ	SI,		R9
     88 	JGT	end
     89 
     90 start:
     91 	ADDQ	$16,		SI		// increment in
     92 	ADDQ	$16,		DI		// increment out
     93 
     94 	// Each KEYROUND generates one byte of key and
     95 	// inserts it into an XMM register at the given 16-bit index.
     96 	// The key state array is uint32 words only using the bottom
     97 	// byte of each word, so the 16-bit OR only copies 8 useful bits.
     98 	// We accumulate alternating bytes into X0 and X1, and then at
     99 	// the end we OR X1<<8 into X0 to produce the actual key.
    100 	//
    101 	// At the beginning of the loop, CX%16 == 0, so the 16 loads
    102 	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
    103 	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
    104 	// without fear of the byte computation CX+15 wrapping around.
    105 	//
    106 	// The first round needs R12[0], the second needs R12[1], and so on.
    107 	// We can avoid memory stalls by starting the load for round n+1
    108 	// before the end of round n, using the LOAD macro.
    109 	LEAQ	(R10)(CX*4),	R12
    110 
    111 #define KEYROUND(xmm, load, off, r1, r2, index) \
    112 	LEAL (R10)(DX*4), R11; \
    113 	MOVBLZX	(R11),	R8; \
    114 	MOVB	r1,		(R11); \
    115 	load((off+1), r2); \
    116 	MOVB	R8,		(off*4)(R12); \
    117 	ADDB	r1,		R8; \
    118 	EXTEND(R8); \
    119 	LEAL (R10)(R8*4), R14; \
    120 	PINSRW	$index, (R14), xmm
    121 
    122 #define LOAD(off, reg) \
    123 	MOVBLZX	(off*4)(R12),	reg; \
    124 	ADDB	reg,		DX; \
    125 	EXTEND(DX)
    126 
    127 #define SKIP(off, reg)
    128 
    129 	LOAD(0, AX)
    130 	KEYROUND(X0, LOAD, 0, AX, BX, 0)
    131 	KEYROUND(X1, LOAD, 1, BX, AX, 0)
    132 	KEYROUND(X0, LOAD, 2, AX, BX, 1)
    133 	KEYROUND(X1, LOAD, 3, BX, AX, 1)
    134 	KEYROUND(X0, LOAD, 4, AX, BX, 2)
    135 	KEYROUND(X1, LOAD, 5, BX, AX, 2)
    136 	KEYROUND(X0, LOAD, 6, AX, BX, 3)
    137 	KEYROUND(X1, LOAD, 7, BX, AX, 3)
    138 	KEYROUND(X0, LOAD, 8, AX, BX, 4)
    139 	KEYROUND(X1, LOAD, 9, BX, AX, 4)
    140 	KEYROUND(X0, LOAD, 10, AX, BX, 5)
    141 	KEYROUND(X1, LOAD, 11, BX, AX, 5)
    142 	KEYROUND(X0, LOAD, 12, AX, BX, 6)
    143 	KEYROUND(X1, LOAD, 13, BX, AX, 6)
    144 	KEYROUND(X0, LOAD, 14, AX, BX, 7)
    145 	KEYROUND(X1, SKIP, 15, BX, AX, 7)
    146 
    147 	ADDB	$16,		CX
    148 
    149 	PSLLQ	$8,		X1
    150 	PXOR	X1,		X0
    151 	MOVOU	-16(SI),	X2
    152 	PXOR	X0,		X2
    153 	MOVOU	X2,		-16(DI)
    154 
    155 	CMPQ	SI,		R9		// cmp in with in+len-16
    156 	JLE	start				// jump if (in <= in+len-16)
    157 
    158 end:
    159 	DECB	CX
    160 	ADDQ	$16,		R9		// tmp = in+len
    161 
    162 	// handle the last bytes, one by one
    163 l2:	CMPQ	SI,		R9		// cmp in with in+len
    164 	JGE	finished			// jump if (in >= in+len)
    165 
    166 	INCB	CX
    167 	EXTEND(CX)
    168 	LEAL (R10)(CX*4), R12
    169 	MOVBLZX	(R12),	AX
    170 
    171 	ADDB	AX,		DX		// y += tx
    172 	EXTEND(DX)
    173 	LEAL (R10)(DX*4), R11
    174 	MOVBLZX	(R11),	BX		// ty = d[y]
    175 	MOVB	BX,		(R12)	// d[x] = ty
    176 	ADDB	AX,		BX		// val = ty+tx
    177 	EXTEND(BX)
    178 	LEAL (R10)(BX*4), R13
    179 	MOVB	AX,		(R11)	// d[y] = tx
    180 	MOVBLZX	(R13),	R8		// val = d[val]
    181 	XORB	(SI),		R8		// xor 1 byte
    182 	MOVB	R8,		(DI)
    183 	INCQ	SI				// in++
    184 	INCQ	DI				// out++
    185 	JMP l2
    186 
    187 finished:
    188 	MOVL	j+20(FP),	BX
    189 	MOVB	DX, 0(BX)
    190 	MOVL	i+16(FP),	AX
    191 	MOVB	CX, 0(AX)
    192 	RET
    193