      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "textflag.h"
      6 
      7 // SHA512 block routine. See sha512block.go for Go equivalent.
      8 //
      9 // The algorithm is detailed in FIPS 180-4:
     10 //
     11 //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
     12 //
     13 // Wt = Mt; for 0 <= t <= 15
      14 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
     15 //
     16 // a = H0
     17 // b = H1
     18 // c = H2
     19 // d = H3
     20 // e = H4
     21 // f = H5
     22 // g = H6
     23 // h = H7
     24 //
     25 // for t = 0 to 79 {
     26 //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
     27 //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
     28 //    h = g
     29 //    g = f
     30 //    f = e
     31 //    e = d + T1
     32 //    d = c
     33 //    c = b
     34 //    b = a
     35 //    a = T1 + T2
     36 // }
     37 //
     38 // H0 = a + H0
     39 // H1 = b + H1
     40 // H2 = c + H2
     41 // H3 = d + H3
     42 // H4 = e + H4
     43 // H5 = f + H5
     44 // H6 = g + H6
     45 // H7 = h + H7
     46 
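// For reference, the pseudo-code above corresponds roughly to the following Go
// sketch (a hedged illustration, not the shipped sha512block.go: it assumes
// "encoding/binary" and "math/bits" are imported, that p holds one 128-byte
// block, that _K is the 80-entry round-constant table, and that ch, maj,
// bigSigma0 and bigSigma1 are hypothetical helpers for the formulas above):
//
//	var w [80]uint64
//	for t := 0; t < 16; t++ {
//		w[t] = binary.BigEndian.Uint64(p[t*8:])
//	}
//	for t := 16; t < 80; t++ {
//		sigma0 := bits.RotateLeft64(w[t-15], -1) ^ bits.RotateLeft64(w[t-15], -8) ^ (w[t-15] >> 7)
//		sigma1 := bits.RotateLeft64(w[t-2], -19) ^ bits.RotateLeft64(w[t-2], -61) ^ (w[t-2] >> 6)
//		w[t] = sigma1 + w[t-7] + sigma0 + w[t-16]
//	}
//	a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7
//	for t := 0; t < 80; t++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
//	h0, h1, h2, h3, h4, h5, h6, h7 = h0+a, h1+b, h2+c, h3+d, h4+e, h5+f, h6+g, h7+h
//
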
     47 // Wt = Mt; for 0 <= t <= 15
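// Loads the big-endian message word from the input (SI), byte-swaps it into
// AX and stores it into the schedule (BP); Wt is left in AX for SHA512T1.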
     48 #define MSGSCHEDULE0(index) \
     49 	MOVQ	(index*8)(SI), AX; \
     50 	BSWAPQ	AX; \
     51 	MOVQ	AX, (index*8)(BP)
     52 
     53 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
     54 //   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
     55 //   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
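// Uses AX, BX, CX and DX; the computed Wt is stored into the schedule and
// left in AX for SHA512T1.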
     56 #define MSGSCHEDULE1(index) \
     57 	MOVQ	((index-2)*8)(BP), AX; \
     58 	MOVQ	AX, CX; \
     59 	RORQ	$19, AX; \
     60 	MOVQ	CX, DX; \
     61 	RORQ	$61, CX; \
     62 	SHRQ	$6, DX; \
     63 	MOVQ	((index-15)*8)(BP), BX; \
     64 	XORQ	CX, AX; \
     65 	MOVQ	BX, CX; \
     66 	XORQ	DX, AX; \
     67 	RORQ	$1, BX; \
     68 	MOVQ	CX, DX; \
     69 	SHRQ	$7, DX; \
     70 	RORQ	$8, CX; \
     71 	ADDQ	((index-7)*8)(BP), AX; \
     72 	XORQ	CX, BX; \
     73 	XORQ	DX, BX; \
     74 	ADDQ	((index-16)*8)(BP), BX; \
     75 	ADDQ	BX, AX; \
     76 	MOVQ	AX, ((index)*8)(BP)
     77 
     78 // Calculate T1 in AX - uses AX, CX and DX registers.
     79 // h is also used as an accumulator. Wt is passed in AX.
     80 //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
     81 //     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
     82 //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
     83 #define SHA512T1(const, e, f, g, h) \
     84 	MOVQ	$const, DX; \
     85 	ADDQ	AX, h; \
     86 	MOVQ	e, AX; \
     87 	ADDQ	DX, h; \
     88 	MOVQ	e, CX; \
     89 	RORQ	$14, AX; \
     90 	MOVQ	e, DX; \
     91 	RORQ	$18, CX; \
     92 	XORQ	CX, AX; \
     93 	MOVQ	e, CX; \
     94 	RORQ	$41, DX; \
     95 	ANDQ	f, CX; \
     96 	XORQ	AX, DX; \
     97 	MOVQ	e, AX; \
     98 	NOTQ	AX; \
     99 	ADDQ	DX, h; \
    100 	ANDQ	g, AX; \
    101 	XORQ	CX, AX; \
    102 	ADDQ	h, AX
    103 
    104 // Calculate T2 in BX - uses BX, CX, DX and DI registers.
    105 //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
    106 //     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
    107 //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
    108 #define SHA512T2(a, b, c) \
    109 	MOVQ	a, DI; \
    110 	MOVQ	c, BX; \
    111 	RORQ	$28, DI; \
    112 	MOVQ	a, DX; \
    113 	ANDQ	b, BX; \
    114 	RORQ	$34, DX; \
    115 	MOVQ	a, CX; \
    116 	ANDQ	c, CX; \
    117 	XORQ	DX, DI; \
    118 	XORQ	CX, BX; \
    119 	MOVQ	a, DX; \
    120 	MOVQ	b, CX; \
    121 	RORQ	$39, DX; \
    122 	ANDQ	a, CX; \
    123 	XORQ	CX, BX; \
    124 	XORQ	DX, DI; \
    125 	ADDQ	DI, BX
    126 
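// In Go terms, the Ch, Maj and BIGSIGMA helpers used in the sketch near the
// top of this file would look roughly like this (hedged; bits.RotateLeft64
// with a negative count is a right rotation):
//
//	func ch(x, y, z uint64) uint64  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint64) uint64 { return (x & y) ^ (x & z) ^ (y & z) }
//	func bigSigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//	func bigSigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//
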
    127 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
    128 // The values for e and a are stored in d and h, ready for rotation.
    129 #define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
    130 	SHA512T1(const, e, f, g, h); \
    131 	SHA512T2(a, b, c); \
    132 	MOVQ	BX, h; \
    133 	ADDQ	AX, d; \
    134 	ADDQ	AX, h
    135 
    136 #define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
    137 	MSGSCHEDULE0(index); \
    138 	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
    139 
    140 #define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
    141 	MSGSCHEDULE1(index); \
    142 	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
    143 
     144 TEXT ·blockAMD64(SB),0,$648-32
    145 	MOVQ	p_base+8(FP), SI
    146 	MOVQ	p_len+16(FP), DX
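	// Round p_len down to a multiple of the 128-byte block size.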
    147 	SHRQ	$7, DX
    148 	SHLQ	$7, DX
    149 
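	// DI = end of the last full block; saved just past the 640-byte message schedule.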
    150 	LEAQ	(SI)(DX*1), DI
    151 	MOVQ	DI, 640(SP)
    152 	CMPQ	SI, DI
    153 	JEQ	end
    154 
    155 	MOVQ	dig+0(FP), BP
    156 	MOVQ	(0*8)(BP), R8		// a = H0
    157 	MOVQ	(1*8)(BP), R9		// b = H1
    158 	MOVQ	(2*8)(BP), R10		// c = H2
    159 	MOVQ	(3*8)(BP), R11		// d = H3
    160 	MOVQ	(4*8)(BP), R12		// e = H4
    161 	MOVQ	(5*8)(BP), R13		// f = H5
    162 	MOVQ	(6*8)(BP), R14		// g = H6
    163 	MOVQ	(7*8)(BP), R15		// h = H7
    164 
    165 loop:
    166 	MOVQ	SP, BP			// message schedule
    167 
    168 	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
    169 	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
    170 	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
    171 	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
    172 	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
    173 	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
    174 	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
    175 	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
    176 	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
    177 	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
    178 	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
    179 	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
    180 	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
    181 	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
    182 	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
    183 	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
    184 
    185 	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
    186 	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
    187 	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
    188 	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
    189 	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
    190 	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
    191 	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
    192 	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
    193 	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
    194 	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
    195 	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
    196 	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
    197 	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
    198 	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
    199 	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
    200 	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
    201 	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
    202 	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
    203 	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
    204 	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
    205 	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
    206 	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
    207 	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
    208 	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
    209 	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
    210 	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
    211 	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
    212 	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
    213 	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
    214 	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
    215 	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
    216 	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
    217 	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
    218 	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
    219 	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
    220 	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
    221 	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
    222 	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
    223 	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
    224 	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
    225 	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
    226 	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
    227 	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
    228 	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
    229 	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
    230 	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
    231 	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
    232 	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
    233 	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
    234 	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
    235 	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
    236 	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
    237 	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
    238 	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
    239 	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
    240 	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
    241 	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
    242 	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
    243 	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
    244 	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
    245 	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
    246 	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
    247 	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
    248 	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
    249 
    250 	MOVQ	dig+0(FP), BP
    251 	ADDQ	(0*8)(BP), R8	// H0 = a + H0
    252 	MOVQ	R8, (0*8)(BP)
    253 	ADDQ	(1*8)(BP), R9	// H1 = b + H1
    254 	MOVQ	R9, (1*8)(BP)
    255 	ADDQ	(2*8)(BP), R10	// H2 = c + H2
    256 	MOVQ	R10, (2*8)(BP)
    257 	ADDQ	(3*8)(BP), R11	// H3 = d + H3
    258 	MOVQ	R11, (3*8)(BP)
    259 	ADDQ	(4*8)(BP), R12	// H4 = e + H4
    260 	MOVQ	R12, (4*8)(BP)
    261 	ADDQ	(5*8)(BP), R13	// H5 = f + H5
    262 	MOVQ	R13, (5*8)(BP)
    263 	ADDQ	(6*8)(BP), R14	// H6 = g + H6
    264 	MOVQ	R14, (6*8)(BP)
    265 	ADDQ	(7*8)(BP), R15	// H7 = h + H7
    266 	MOVQ	R15, (7*8)(BP)
    267 
    268 	ADDQ	$128, SI
    269 	CMPQ	SI, 640(SP)
    270 	JB	loop
    271 
    272 end:
    273 	RET
    274 
    275 // Version below is based on "Fast SHA512 Implementations on Intel
    276 // Architecture Processors" White-paper
    277 // http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
     278 // AVX2 version by Intel, same algorithm as in the Linux kernel:
    279 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
    280 
     281 // James Guilford <james.guilford@intel.com>
     282 // Kirk Yap <kirk.s.yap@intel.com>
     283 // Tim Chen <tim.c.chen@linux.intel.com>
     284 // David Cote <david.m.cote@intel.com>
     285 // Aleksey Sidorov <aleksey.sidorov@intel.com>
    286 
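// Stack frame layout for blockAVX2: YFER holds four message qwords with the
// round constants pre-added, SRND is the remaining round-group counter, INP is
// the current input pointer and INPEND marks the end of the input.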
    287 #define YFER_SIZE (4*8)
    288 #define SRND_SIZE (1*8)
    289 #define INP_SIZE (1*8)
    290 
    291 #define frame_YFER (0)
    292 #define frame_SRND (frame_YFER + YFER_SIZE)
    293 #define frame_INP (frame_SRND + SRND_SIZE)
    294 #define frame_INPEND (frame_INP + INP_SIZE)
    295 
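// addm adds the memory word p1 into register p2 and stores the sum back to p1.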
    296 #define addm(p1, p2) \
    297 	ADDQ p1, p2; \
    298 	MOVQ p2, p1
    299 
    300 #define COPY_YMM_AND_BSWAP(p1, p2, p3) \
    301 	VMOVDQU p2, p1;    \
    302 	VPSHUFB p3, p1, p1
    303 
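// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = {YSRC1, YSRC2} >> RVAL*8.
// AVX2's VPALIGNR only shifts within each 128-bit lane, so VPERM2F128 first
// assembles the lane pair that makes the per-lane byte-align act as a shift
// across the two source registers.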
    304 #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
    305 	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
    306 	VPALIGNR   $RVAL, YSRC2, YDST, YDST
    307 
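// VPSHUFB mask that byte-swaps each 64-bit lane, converting the big-endian
// message words to the CPU's little-endian order.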
    308 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
    309 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
    310 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
    311 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
    312 
    313 GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
    314 
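// Low 128 bits zero, high 128 bits all ones: ANDing with this keeps only the
// upper lane of a YMM register.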
    315 DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
    316 DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
    317 DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
    318 DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
    319 
    320 GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
    321 
     322 TEXT ·blockAVX2(SB), NOSPLIT, $56-32
    323 	MOVQ dig+0(FP), SI
    324 	MOVQ p_base+8(FP), DI
    325 	MOVQ p_len+16(FP), DX
    326 
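	// Round p_len down to a multiple of the 128-byte block size; the JZ below
	// skips the hash loop when no full block remains.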
    327 	SHRQ $7, DX
    328 	SHLQ $7, DX
    329 
    330 	JZ   done_hash
    331 	ADDQ DI, DX
    332 	MOVQ DX, frame_INPEND(SP)
    333 
    334 	MOVQ (0*8)(SI), AX
    335 	MOVQ (1*8)(SI), BX
    336 	MOVQ (2*8)(SI), CX
    337 	MOVQ (3*8)(SI), R8
    338 	MOVQ (4*8)(SI), DX
    339 	MOVQ (5*8)(SI), R9
    340 	MOVQ (6*8)(SI), R10
    341 	MOVQ (7*8)(SI), R11
    342 
    343 	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
    344 
    345 loop0:
     346 	MOVQ ·_K+0(SB), BP
    347 
     348 	// byte swap first 16 qwords
    349 	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
    350 	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
    351 	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
    352 	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
    353 
    354 	MOVQ DI, frame_INP(SP)
    355 
     356 	// schedule the remaining 64 message qwords, doing 4 iterations of 16 rounds each
    357 	MOVQ $4, frame_SRND(SP)
    358 
    359 loop1:
    360 	VPADDQ  (BP), Y4, Y0
    361 	VMOVDQU Y0, frame_YFER(SP)
    362 
    363 	MY_VPALIGNR(Y0, Y7, Y6, 8)
    364 
    365 	VPADDQ Y4, Y0, Y0
    366 
    367 	MY_VPALIGNR(Y1, Y5, Y4, 8)
    368 
    369 	VPSRLQ $1, Y1, Y2
    370 	VPSLLQ $(64-1), Y1, Y3
    371 	VPOR   Y2, Y3, Y3
    372 
    373 	VPSRLQ $7, Y1, Y8
    374 
    375 	MOVQ  AX, DI
    376 	RORXQ $41, DX, R13
    377 	RORXQ $18, DX, R14
    378 	ADDQ  frame_YFER(SP), R11
    379 	ORQ   CX, DI
    380 	MOVQ  R9, R15
    381 	RORXQ $34, AX, R12
    382 
    383 	XORQ  R14, R13
    384 	XORQ  R10, R15
    385 	RORXQ $14, DX, R14
    386 
    387 	ANDQ  DX, R15
    388 	XORQ  R14, R13
    389 	RORXQ $39, AX, R14
    390 	ADDQ  R11, R8
    391 
    392 	ANDQ  BX, DI
    393 	XORQ  R12, R14
    394 	RORXQ $28, AX, R12
    395 
    396 	XORQ R10, R15
    397 	XORQ R12, R14
    398 	MOVQ AX, R12
    399 	ANDQ CX, R12
    400 
    401 	ADDQ R13, R15
    402 	ORQ  R12, DI
    403 	ADDQ R14, R11
    404 
    405 	ADDQ R15, R8
    406 
    407 	ADDQ R15, R11
    408 	ADDQ DI, R11
    409 
    410 	VPSRLQ $8, Y1, Y2
    411 	VPSLLQ $(64-8), Y1, Y1
    412 	VPOR   Y2, Y1, Y1
    413 
    414 	VPXOR Y8, Y3, Y3
    415 	VPXOR Y1, Y3, Y1
    416 
    417 	VPADDQ Y1, Y0, Y0
    418 
    419 	VPERM2F128 $0x0, Y0, Y0, Y4
    420 
    421 	VPAND MASK_YMM_LO<>(SB), Y0, Y0
    422 
    423 	VPERM2F128 $0x11, Y7, Y7, Y2
    424 	VPSRLQ     $6, Y2, Y8
    425 
    426 	MOVQ  R11, DI
    427 	RORXQ $41, R8, R13
    428 	RORXQ $18, R8, R14
    429 	ADDQ  1*8+frame_YFER(SP), R10
    430 	ORQ   BX, DI
    431 
    432 	MOVQ  DX, R15
    433 	RORXQ $34, R11, R12
    434 	XORQ  R14, R13
    435 	XORQ  R9, R15
    436 
    437 	RORXQ $14, R8, R14
    438 	XORQ  R14, R13
    439 	RORXQ $39, R11, R14
    440 	ANDQ  R8, R15
    441 	ADDQ  R10, CX
    442 
    443 	ANDQ AX, DI
    444 	XORQ R12, R14
    445 
    446 	RORXQ $28, R11, R12
    447 	XORQ  R9, R15
    448 
    449 	XORQ R12, R14
    450 	MOVQ R11, R12
    451 	ANDQ BX, R12
    452 	ADDQ R13, R15
    453 
    454 	ORQ  R12, DI
    455 	ADDQ R14, R10
    456 
    457 	ADDQ R15, CX
    458 	ADDQ R15, R10
    459 	ADDQ DI, R10
    460 
    461 	VPSRLQ $19, Y2, Y3
    462 	VPSLLQ $(64-19), Y2, Y1
    463 	VPOR   Y1, Y3, Y3
    464 	VPXOR  Y3, Y8, Y8
    465 	VPSRLQ $61, Y2, Y3
    466 	VPSLLQ $(64-61), Y2, Y1
    467 	VPOR   Y1, Y3, Y3
    468 	VPXOR  Y3, Y8, Y8
    469 
    470 	VPADDQ Y8, Y4, Y4
    471 
    472 	VPSRLQ $6, Y4, Y8
    473 
    474 	MOVQ  R10, DI
    475 	RORXQ $41, CX, R13
    476 	ADDQ  2*8+frame_YFER(SP), R9
    477 
    478 	RORXQ $18, CX, R14
    479 	ORQ   AX, DI
    480 	MOVQ  R8, R15
    481 	XORQ  DX, R15
    482 
    483 	RORXQ $34, R10, R12
    484 	XORQ  R14, R13
    485 	ANDQ  CX, R15
    486 
    487 	RORXQ $14, CX, R14
    488 	ADDQ  R9, BX
    489 	ANDQ  R11, DI
    490 
    491 	XORQ  R14, R13
    492 	RORXQ $39, R10, R14
    493 	XORQ  DX, R15
    494 
    495 	XORQ  R12, R14
    496 	RORXQ $28, R10, R12
    497 
    498 	XORQ R12, R14
    499 	MOVQ R10, R12
    500 	ANDQ AX, R12
    501 	ADDQ R13, R15
    502 
    503 	ORQ  R12, DI
    504 	ADDQ R14, R9
    505 	ADDQ R15, BX
    506 	ADDQ R15, R9
    507 
    508 	ADDQ DI, R9
    509 
    510 	VPSRLQ $19, Y4, Y3
    511 	VPSLLQ $(64-19), Y4, Y1
    512 	VPOR   Y1, Y3, Y3
    513 	VPXOR  Y3, Y8, Y8
    514 	VPSRLQ $61, Y4, Y3
    515 	VPSLLQ $(64-61), Y4, Y1
    516 	VPOR   Y1, Y3, Y3
    517 	VPXOR  Y3, Y8, Y8
    518 
    519 	VPADDQ Y8, Y0, Y2
    520 
    521 	VPBLENDD $0xF0, Y2, Y4, Y4
    522 
    523 	MOVQ  R9, DI
    524 	RORXQ $41, BX, R13
    525 	RORXQ $18, BX, R14
    526 	ADDQ  3*8+frame_YFER(SP), DX
    527 	ORQ   R11, DI
    528 
    529 	MOVQ  CX, R15
    530 	RORXQ $34, R9, R12
    531 	XORQ  R14, R13
    532 	XORQ  R8, R15
    533 
    534 	RORXQ $14, BX, R14
    535 	ANDQ  BX, R15
    536 	ADDQ  DX, AX
    537 	ANDQ  R10, DI
    538 
    539 	XORQ R14, R13
    540 	XORQ R8, R15
    541 
    542 	RORXQ $39, R9, R14
    543 	ADDQ  R13, R15
    544 
    545 	XORQ R12, R14
    546 	ADDQ R15, AX
    547 
    548 	RORXQ $28, R9, R12
    549 
    550 	XORQ R12, R14
    551 	MOVQ R9, R12
    552 	ANDQ R11, R12
    553 	ORQ  R12, DI
    554 
    555 	ADDQ R14, DX
    556 	ADDQ R15, DX
    557 	ADDQ DI, DX
    558 
    559 	VPADDQ  1*32(BP), Y5, Y0
    560 	VMOVDQU Y0, frame_YFER(SP)
    561 
    562 	MY_VPALIGNR(Y0, Y4, Y7, 8)
    563 
    564 	VPADDQ Y5, Y0, Y0
    565 
    566 	MY_VPALIGNR(Y1, Y6, Y5, 8)
    567 
    568 	VPSRLQ $1, Y1, Y2
    569 	VPSLLQ $(64-1), Y1, Y3
    570 	VPOR   Y2, Y3, Y3
    571 
    572 	VPSRLQ $7, Y1, Y8
    573 
    574 	MOVQ  DX, DI
    575 	RORXQ $41, AX, R13
    576 	RORXQ $18, AX, R14
    577 	ADDQ  frame_YFER(SP), R8
    578 	ORQ   R10, DI
    579 	MOVQ  BX, R15
    580 	RORXQ $34, DX, R12
    581 
    582 	XORQ  R14, R13
    583 	XORQ  CX, R15
    584 	RORXQ $14, AX, R14
    585 
    586 	ANDQ  AX, R15
    587 	XORQ  R14, R13
    588 	RORXQ $39, DX, R14
    589 	ADDQ  R8, R11
    590 
    591 	ANDQ  R9, DI
    592 	XORQ  R12, R14
    593 	RORXQ $28, DX, R12
    594 
    595 	XORQ CX, R15
    596 	XORQ R12, R14
    597 	MOVQ DX, R12
    598 	ANDQ R10, R12
    599 
    600 	ADDQ R13, R15
    601 	ORQ  R12, DI
    602 	ADDQ R14, R8
    603 
    604 	ADDQ R15, R11
    605 
    606 	ADDQ R15, R8
    607 	ADDQ DI, R8
    608 
    609 	VPSRLQ $8, Y1, Y2
    610 	VPSLLQ $(64-8), Y1, Y1
    611 	VPOR   Y2, Y1, Y1
    612 
    613 	VPXOR Y8, Y3, Y3
    614 	VPXOR Y1, Y3, Y1
    615 
    616 	VPADDQ Y1, Y0, Y0
    617 
    618 	VPERM2F128 $0x0, Y0, Y0, Y5
    619 
    620 	VPAND MASK_YMM_LO<>(SB), Y0, Y0
    621 
    622 	VPERM2F128 $0x11, Y4, Y4, Y2
    623 	VPSRLQ     $6, Y2, Y8
    624 
    625 	MOVQ  R8, DI
    626 	RORXQ $41, R11, R13
    627 	RORXQ $18, R11, R14
    628 	ADDQ  1*8+frame_YFER(SP), CX
    629 	ORQ   R9, DI
    630 
    631 	MOVQ  AX, R15
    632 	RORXQ $34, R8, R12
    633 	XORQ  R14, R13
    634 	XORQ  BX, R15
    635 
    636 	RORXQ $14, R11, R14
    637 	XORQ  R14, R13
    638 	RORXQ $39, R8, R14
    639 	ANDQ  R11, R15
    640 	ADDQ  CX, R10
    641 
    642 	ANDQ DX, DI
    643 	XORQ R12, R14
    644 
    645 	RORXQ $28, R8, R12
    646 	XORQ  BX, R15
    647 
    648 	XORQ R12, R14
    649 	MOVQ R8, R12
    650 	ANDQ R9, R12
    651 	ADDQ R13, R15
    652 
    653 	ORQ  R12, DI
    654 	ADDQ R14, CX
    655 
    656 	ADDQ R15, R10
    657 	ADDQ R15, CX
    658 	ADDQ DI, CX
    659 
    660 	VPSRLQ $19, Y2, Y3
    661 	VPSLLQ $(64-19), Y2, Y1
    662 	VPOR   Y1, Y3, Y3
    663 	VPXOR  Y3, Y8, Y8
    664 	VPSRLQ $61, Y2, Y3
    665 	VPSLLQ $(64-61), Y2, Y1
    666 	VPOR   Y1, Y3, Y3
    667 	VPXOR  Y3, Y8, Y8
    668 
    669 	VPADDQ Y8, Y5, Y5
    670 
    671 	VPSRLQ $6, Y5, Y8
    672 
    673 	MOVQ  CX, DI
    674 	RORXQ $41, R10, R13
    675 	ADDQ  2*8+frame_YFER(SP), BX
    676 
    677 	RORXQ $18, R10, R14
    678 	ORQ   DX, DI
    679 	MOVQ  R11, R15
    680 	XORQ  AX, R15
    681 
    682 	RORXQ $34, CX, R12
    683 	XORQ  R14, R13
    684 	ANDQ  R10, R15
    685 
    686 	RORXQ $14, R10, R14
    687 	ADDQ  BX, R9
    688 	ANDQ  R8, DI
    689 
    690 	XORQ  R14, R13
    691 	RORXQ $39, CX, R14
    692 	XORQ  AX, R15
    693 
    694 	XORQ  R12, R14
    695 	RORXQ $28, CX, R12
    696 
    697 	XORQ R12, R14
    698 	MOVQ CX, R12
    699 	ANDQ DX, R12
    700 	ADDQ R13, R15
    701 
    702 	ORQ  R12, DI
    703 	ADDQ R14, BX
    704 	ADDQ R15, R9
    705 	ADDQ R15, BX
    706 
    707 	ADDQ DI, BX
    708 
    709 	VPSRLQ $19, Y5, Y3
    710 	VPSLLQ $(64-19), Y5, Y1
    711 	VPOR   Y1, Y3, Y3
    712 	VPXOR  Y3, Y8, Y8
    713 	VPSRLQ $61, Y5, Y3
    714 	VPSLLQ $(64-61), Y5, Y1
    715 	VPOR   Y1, Y3, Y3
    716 	VPXOR  Y3, Y8, Y8
    717 
    718 	VPADDQ Y8, Y0, Y2
    719 
    720 	VPBLENDD $0xF0, Y2, Y5, Y5
    721 
    722 	MOVQ  BX, DI
    723 	RORXQ $41, R9, R13
    724 	RORXQ $18, R9, R14
    725 	ADDQ  3*8+frame_YFER(SP), AX
    726 	ORQ   R8, DI
    727 
    728 	MOVQ  R10, R15
    729 	RORXQ $34, BX, R12
    730 	XORQ  R14, R13
    731 	XORQ  R11, R15
    732 
    733 	RORXQ $14, R9, R14
    734 	ANDQ  R9, R15
    735 	ADDQ  AX, DX
    736 	ANDQ  CX, DI
    737 
    738 	XORQ R14, R13
    739 	XORQ R11, R15
    740 
    741 	RORXQ $39, BX, R14
    742 	ADDQ  R13, R15
    743 
    744 	XORQ R12, R14
    745 	ADDQ R15, DX
    746 
    747 	RORXQ $28, BX, R12
    748 
    749 	XORQ R12, R14
    750 	MOVQ BX, R12
    751 	ANDQ R8, R12
    752 	ORQ  R12, DI
    753 
    754 	ADDQ R14, AX
    755 	ADDQ R15, AX
    756 	ADDQ DI, AX
    757 
    758 	VPADDQ  2*32(BP), Y6, Y0
    759 	VMOVDQU Y0, frame_YFER(SP)
    760 
    761 	MY_VPALIGNR(Y0, Y5, Y4, 8)
    762 
    763 	VPADDQ Y6, Y0, Y0
    764 
    765 	MY_VPALIGNR(Y1, Y7, Y6, 8)
    766 
    767 	VPSRLQ $1, Y1, Y2
    768 	VPSLLQ $(64-1), Y1, Y3
    769 	VPOR   Y2, Y3, Y3
    770 
    771 	VPSRLQ $7, Y1, Y8
    772 
    773 	MOVQ  AX, DI
    774 	RORXQ $41, DX, R13
    775 	RORXQ $18, DX, R14
    776 	ADDQ  frame_YFER(SP), R11
    777 	ORQ   CX, DI
    778 	MOVQ  R9, R15
    779 	RORXQ $34, AX, R12
    780 
    781 	XORQ  R14, R13
    782 	XORQ  R10, R15
    783 	RORXQ $14, DX, R14
    784 
    785 	ANDQ  DX, R15
    786 	XORQ  R14, R13
    787 	RORXQ $39, AX, R14
    788 	ADDQ  R11, R8
    789 
    790 	ANDQ  BX, DI
    791 	XORQ  R12, R14
    792 	RORXQ $28, AX, R12
    793 
    794 	XORQ R10, R15
    795 	XORQ R12, R14
    796 	MOVQ AX, R12
    797 	ANDQ CX, R12
    798 
    799 	ADDQ R13, R15
    800 	ORQ  R12, DI
    801 	ADDQ R14, R11
    802 
    803 	ADDQ R15, R8
    804 
    805 	ADDQ R15, R11
    806 	ADDQ DI, R11
    807 
    808 	VPSRLQ $8, Y1, Y2
    809 	VPSLLQ $(64-8), Y1, Y1
    810 	VPOR   Y2, Y1, Y1
    811 
    812 	VPXOR Y8, Y3, Y3
    813 	VPXOR Y1, Y3, Y1
    814 
    815 	VPADDQ Y1, Y0, Y0
    816 
    817 	VPERM2F128 $0x0, Y0, Y0, Y6
    818 
    819 	VPAND MASK_YMM_LO<>(SB), Y0, Y0
    820 
    821 	VPERM2F128 $0x11, Y5, Y5, Y2
    822 	VPSRLQ     $6, Y2, Y8
    823 
    824 	MOVQ  R11, DI
    825 	RORXQ $41, R8, R13
    826 	RORXQ $18, R8, R14
    827 	ADDQ  1*8+frame_YFER(SP), R10
    828 	ORQ   BX, DI
    829 
    830 	MOVQ  DX, R15
    831 	RORXQ $34, R11, R12
    832 	XORQ  R14, R13
    833 	XORQ  R9, R15
    834 
    835 	RORXQ $14, R8, R14
    836 	XORQ  R14, R13
    837 	RORXQ $39, R11, R14
    838 	ANDQ  R8, R15
    839 	ADDQ  R10, CX
    840 
    841 	ANDQ AX, DI
    842 	XORQ R12, R14
    843 
    844 	RORXQ $28, R11, R12
    845 	XORQ  R9, R15
    846 
    847 	XORQ R12, R14
    848 	MOVQ R11, R12
    849 	ANDQ BX, R12
    850 	ADDQ R13, R15
    851 
    852 	ORQ  R12, DI
    853 	ADDQ R14, R10
    854 
    855 	ADDQ R15, CX
    856 	ADDQ R15, R10
    857 	ADDQ DI, R10
    858 
    859 	VPSRLQ $19, Y2, Y3
    860 	VPSLLQ $(64-19), Y2, Y1
    861 	VPOR   Y1, Y3, Y3
    862 	VPXOR  Y3, Y8, Y8
    863 	VPSRLQ $61, Y2, Y3
    864 	VPSLLQ $(64-61), Y2, Y1
    865 	VPOR   Y1, Y3, Y3
    866 	VPXOR  Y3, Y8, Y8
    867 
    868 	VPADDQ Y8, Y6, Y6
    869 
    870 	VPSRLQ $6, Y6, Y8
    871 
    872 	MOVQ  R10, DI
    873 	RORXQ $41, CX, R13
    874 	ADDQ  2*8+frame_YFER(SP), R9
    875 
    876 	RORXQ $18, CX, R14
    877 	ORQ   AX, DI
    878 	MOVQ  R8, R15
    879 	XORQ  DX, R15
    880 
    881 	RORXQ $34, R10, R12
    882 	XORQ  R14, R13
    883 	ANDQ  CX, R15
    884 
    885 	RORXQ $14, CX, R14
    886 	ADDQ  R9, BX
    887 	ANDQ  R11, DI
    888 
    889 	XORQ  R14, R13
    890 	RORXQ $39, R10, R14
    891 	XORQ  DX, R15
    892 
    893 	XORQ  R12, R14
    894 	RORXQ $28, R10, R12
    895 
    896 	XORQ R12, R14
    897 	MOVQ R10, R12
    898 	ANDQ AX, R12
    899 	ADDQ R13, R15
    900 
    901 	ORQ  R12, DI
    902 	ADDQ R14, R9
    903 	ADDQ R15, BX
    904 	ADDQ R15, R9
    905 
    906 	ADDQ DI, R9
    907 
    908 	VPSRLQ $19, Y6, Y3
    909 	VPSLLQ $(64-19), Y6, Y1
    910 	VPOR   Y1, Y3, Y3
    911 	VPXOR  Y3, Y8, Y8
    912 	VPSRLQ $61, Y6, Y3
    913 	VPSLLQ $(64-61), Y6, Y1
    914 	VPOR   Y1, Y3, Y3
    915 	VPXOR  Y3, Y8, Y8
    916 
    917 	VPADDQ Y8, Y0, Y2
    918 
    919 	VPBLENDD $0xF0, Y2, Y6, Y6
    920 
    921 	MOVQ  R9, DI
    922 	RORXQ $41, BX, R13
    923 	RORXQ $18, BX, R14
    924 	ADDQ  3*8+frame_YFER(SP), DX
    925 	ORQ   R11, DI
    926 
    927 	MOVQ  CX, R15
    928 	RORXQ $34, R9, R12
    929 	XORQ  R14, R13
    930 	XORQ  R8, R15
    931 
    932 	RORXQ $14, BX, R14
    933 	ANDQ  BX, R15
    934 	ADDQ  DX, AX
    935 	ANDQ  R10, DI
    936 
    937 	XORQ R14, R13
    938 	XORQ R8, R15
    939 
    940 	RORXQ $39, R9, R14
    941 	ADDQ  R13, R15
    942 
    943 	XORQ R12, R14
    944 	ADDQ R15, AX
    945 
    946 	RORXQ $28, R9, R12
    947 
    948 	XORQ R12, R14
    949 	MOVQ R9, R12
    950 	ANDQ R11, R12
    951 	ORQ  R12, DI
    952 
    953 	ADDQ R14, DX
    954 	ADDQ R15, DX
    955 	ADDQ DI, DX
    956 
    957 	VPADDQ  3*32(BP), Y7, Y0
    958 	VMOVDQU Y0, frame_YFER(SP)
    959 	ADDQ    $(4*32), BP
    960 
    961 	MY_VPALIGNR(Y0, Y6, Y5, 8)
    962 
    963 	VPADDQ Y7, Y0, Y0
    964 
    965 	MY_VPALIGNR(Y1, Y4, Y7, 8)
    966 
    967 	VPSRLQ $1, Y1, Y2
    968 	VPSLLQ $(64-1), Y1, Y3
    969 	VPOR   Y2, Y3, Y3
    970 
    971 	VPSRLQ $7, Y1, Y8
    972 
    973 	MOVQ  DX, DI
    974 	RORXQ $41, AX, R13
    975 	RORXQ $18, AX, R14
    976 	ADDQ  frame_YFER(SP), R8
    977 	ORQ   R10, DI
    978 	MOVQ  BX, R15
    979 	RORXQ $34, DX, R12
    980 
    981 	XORQ  R14, R13
    982 	XORQ  CX, R15
    983 	RORXQ $14, AX, R14
    984 
    985 	ANDQ  AX, R15
    986 	XORQ  R14, R13
    987 	RORXQ $39, DX, R14
    988 	ADDQ  R8, R11
    989 
    990 	ANDQ  R9, DI
    991 	XORQ  R12, R14
    992 	RORXQ $28, DX, R12
    993 
    994 	XORQ CX, R15
    995 	XORQ R12, R14
    996 	MOVQ DX, R12
    997 	ANDQ R10, R12
    998 
    999 	ADDQ R13, R15
   1000 	ORQ  R12, DI
   1001 	ADDQ R14, R8
   1002 
   1003 	ADDQ R15, R11
   1004 
   1005 	ADDQ R15, R8
   1006 	ADDQ DI, R8
   1007 
   1008 	VPSRLQ $8, Y1, Y2
   1009 	VPSLLQ $(64-8), Y1, Y1
   1010 	VPOR   Y2, Y1, Y1
   1011 
   1012 	VPXOR Y8, Y3, Y3
   1013 	VPXOR Y1, Y3, Y1
   1014 
   1015 	VPADDQ Y1, Y0, Y0
   1016 
   1017 	VPERM2F128 $0x0, Y0, Y0, Y7
   1018 
   1019 	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   1020 
   1021 	VPERM2F128 $0x11, Y6, Y6, Y2
   1022 	VPSRLQ     $6, Y2, Y8
   1023 
   1024 	MOVQ  R8, DI
   1025 	RORXQ $41, R11, R13
   1026 	RORXQ $18, R11, R14
   1027 	ADDQ  1*8+frame_YFER(SP), CX
   1028 	ORQ   R9, DI
   1029 
   1030 	MOVQ  AX, R15
   1031 	RORXQ $34, R8, R12
   1032 	XORQ  R14, R13
   1033 	XORQ  BX, R15
   1034 
   1035 	RORXQ $14, R11, R14
   1036 	XORQ  R14, R13
   1037 	RORXQ $39, R8, R14
   1038 	ANDQ  R11, R15
   1039 	ADDQ  CX, R10
   1040 
   1041 	ANDQ DX, DI
   1042 	XORQ R12, R14
   1043 
   1044 	RORXQ $28, R8, R12
   1045 	XORQ  BX, R15
   1046 
   1047 	XORQ R12, R14
   1048 	MOVQ R8, R12
   1049 	ANDQ R9, R12
   1050 	ADDQ R13, R15
   1051 
   1052 	ORQ  R12, DI
   1053 	ADDQ R14, CX
   1054 
   1055 	ADDQ R15, R10
   1056 	ADDQ R15, CX
   1057 	ADDQ DI, CX
   1058 
   1059 	VPSRLQ $19, Y2, Y3
   1060 	VPSLLQ $(64-19), Y2, Y1
   1061 	VPOR   Y1, Y3, Y3
   1062 	VPXOR  Y3, Y8, Y8
   1063 	VPSRLQ $61, Y2, Y3
   1064 	VPSLLQ $(64-61), Y2, Y1
   1065 	VPOR   Y1, Y3, Y3
   1066 	VPXOR  Y3, Y8, Y8
   1067 
   1068 	VPADDQ Y8, Y7, Y7
   1069 
   1070 	VPSRLQ $6, Y7, Y8
   1071 
   1072 	MOVQ  CX, DI
   1073 	RORXQ $41, R10, R13
   1074 	ADDQ  2*8+frame_YFER(SP), BX
   1075 
   1076 	RORXQ $18, R10, R14
   1077 	ORQ   DX, DI
   1078 	MOVQ  R11, R15
   1079 	XORQ  AX, R15
   1080 
   1081 	RORXQ $34, CX, R12
   1082 	XORQ  R14, R13
   1083 	ANDQ  R10, R15
   1084 
   1085 	RORXQ $14, R10, R14
   1086 	ADDQ  BX, R9
   1087 	ANDQ  R8, DI
   1088 
   1089 	XORQ  R14, R13
   1090 	RORXQ $39, CX, R14
   1091 	XORQ  AX, R15
   1092 
   1093 	XORQ  R12, R14
   1094 	RORXQ $28, CX, R12
   1095 
   1096 	XORQ R12, R14
   1097 	MOVQ CX, R12
   1098 	ANDQ DX, R12
   1099 	ADDQ R13, R15
   1100 
   1101 	ORQ  R12, DI
   1102 	ADDQ R14, BX
   1103 	ADDQ R15, R9
   1104 	ADDQ R15, BX
   1105 
   1106 	ADDQ DI, BX
   1107 
   1108 	VPSRLQ $19, Y7, Y3
   1109 	VPSLLQ $(64-19), Y7, Y1
   1110 	VPOR   Y1, Y3, Y3
   1111 	VPXOR  Y3, Y8, Y8
   1112 	VPSRLQ $61, Y7, Y3
   1113 	VPSLLQ $(64-61), Y7, Y1
   1114 	VPOR   Y1, Y3, Y3
   1115 	VPXOR  Y3, Y8, Y8
   1116 
   1117 	VPADDQ Y8, Y0, Y2
   1118 
   1119 	VPBLENDD $0xF0, Y2, Y7, Y7
   1120 
   1121 	MOVQ  BX, DI
   1122 	RORXQ $41, R9, R13
   1123 	RORXQ $18, R9, R14
   1124 	ADDQ  3*8+frame_YFER(SP), AX
   1125 	ORQ   R8, DI
   1126 
   1127 	MOVQ  R10, R15
   1128 	RORXQ $34, BX, R12
   1129 	XORQ  R14, R13
   1130 	XORQ  R11, R15
   1131 
   1132 	RORXQ $14, R9, R14
   1133 	ANDQ  R9, R15
   1134 	ADDQ  AX, DX
   1135 	ANDQ  CX, DI
   1136 
   1137 	XORQ R14, R13
   1138 	XORQ R11, R15
   1139 
   1140 	RORXQ $39, BX, R14
   1141 	ADDQ  R13, R15
   1142 
   1143 	XORQ R12, R14
   1144 	ADDQ R15, DX
   1145 
   1146 	RORXQ $28, BX, R12
   1147 
   1148 	XORQ R12, R14
   1149 	MOVQ BX, R12
   1150 	ANDQ R8, R12
   1151 	ORQ  R12, DI
   1152 
   1153 	ADDQ R14, AX
   1154 	ADDQ R15, AX
   1155 	ADDQ DI, AX
   1156 
   1157 	SUBQ $1, frame_SRND(SP)
   1158 	JNE  loop1
   1159 
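	// Final 16 rounds: two passes of loop2, 8 rounds each, consuming the words
	// already scheduled into Y4..Y7; no further message scheduling is needed.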
   1160 	MOVQ $2, frame_SRND(SP)
   1161 
   1162 loop2:
   1163 	VPADDQ  (BP), Y4, Y0
   1164 	VMOVDQU Y0, frame_YFER(SP)
   1165 
   1166 	MOVQ  R9, R15
   1167 	RORXQ $41, DX, R13
   1168 	RORXQ $18, DX, R14
   1169 	XORQ  R10, R15
   1170 
   1171 	XORQ  R14, R13
   1172 	RORXQ $14, DX, R14
   1173 	ANDQ  DX, R15
   1174 
   1175 	XORQ  R14, R13
   1176 	RORXQ $34, AX, R12
   1177 	XORQ  R10, R15
   1178 	RORXQ $39, AX, R14
   1179 	MOVQ  AX, DI
   1180 
   1181 	XORQ  R12, R14
   1182 	RORXQ $28, AX, R12
   1183 	ADDQ  frame_YFER(SP), R11
   1184 	ORQ   CX, DI
   1185 
   1186 	XORQ R12, R14
   1187 	MOVQ AX, R12
   1188 	ANDQ BX, DI
   1189 	ANDQ CX, R12
   1190 	ADDQ R13, R15
   1191 
   1192 	ADDQ R11, R8
   1193 	ORQ  R12, DI
   1194 	ADDQ R14, R11
   1195 
   1196 	ADDQ R15, R8
   1197 
   1198 	ADDQ  R15, R11
   1199 	MOVQ  DX, R15
   1200 	RORXQ $41, R8, R13
   1201 	RORXQ $18, R8, R14
   1202 	XORQ  R9, R15
   1203 
   1204 	XORQ  R14, R13
   1205 	RORXQ $14, R8, R14
   1206 	ANDQ  R8, R15
   1207 	ADDQ  DI, R11
   1208 
   1209 	XORQ  R14, R13
   1210 	RORXQ $34, R11, R12
   1211 	XORQ  R9, R15
   1212 	RORXQ $39, R11, R14
   1213 	MOVQ  R11, DI
   1214 
   1215 	XORQ  R12, R14
   1216 	RORXQ $28, R11, R12
   1217 	ADDQ  8*1+frame_YFER(SP), R10
   1218 	ORQ   BX, DI
   1219 
   1220 	XORQ R12, R14
   1221 	MOVQ R11, R12
   1222 	ANDQ AX, DI
   1223 	ANDQ BX, R12
   1224 	ADDQ R13, R15
   1225 
   1226 	ADDQ R10, CX
   1227 	ORQ  R12, DI
   1228 	ADDQ R14, R10
   1229 
   1230 	ADDQ R15, CX
   1231 
   1232 	ADDQ  R15, R10
   1233 	MOVQ  R8, R15
   1234 	RORXQ $41, CX, R13
   1235 	RORXQ $18, CX, R14
   1236 	XORQ  DX, R15
   1237 
   1238 	XORQ  R14, R13
   1239 	RORXQ $14, CX, R14
   1240 	ANDQ  CX, R15
   1241 	ADDQ  DI, R10
   1242 
   1243 	XORQ  R14, R13
   1244 	RORXQ $34, R10, R12
   1245 	XORQ  DX, R15
   1246 	RORXQ $39, R10, R14
   1247 	MOVQ  R10, DI
   1248 
   1249 	XORQ  R12, R14
   1250 	RORXQ $28, R10, R12
   1251 	ADDQ  8*2+frame_YFER(SP), R9
   1252 	ORQ   AX, DI
   1253 
   1254 	XORQ R12, R14
   1255 	MOVQ R10, R12
   1256 	ANDQ R11, DI
   1257 	ANDQ AX, R12
   1258 	ADDQ R13, R15
   1259 
   1260 	ADDQ R9, BX
   1261 	ORQ  R12, DI
   1262 	ADDQ R14, R9
   1263 
   1264 	ADDQ R15, BX
   1265 
   1266 	ADDQ  R15, R9
   1267 	MOVQ  CX, R15
   1268 	RORXQ $41, BX, R13
   1269 	RORXQ $18, BX, R14
   1270 	XORQ  R8, R15
   1271 
   1272 	XORQ  R14, R13
   1273 	RORXQ $14, BX, R14
   1274 	ANDQ  BX, R15
   1275 	ADDQ  DI, R9
   1276 
   1277 	XORQ  R14, R13
   1278 	RORXQ $34, R9, R12
   1279 	XORQ  R8, R15
   1280 	RORXQ $39, R9, R14
   1281 	MOVQ  R9, DI
   1282 
   1283 	XORQ  R12, R14
   1284 	RORXQ $28, R9, R12
   1285 	ADDQ  8*3+frame_YFER(SP), DX
   1286 	ORQ   R11, DI
   1287 
   1288 	XORQ R12, R14
   1289 	MOVQ R9, R12
   1290 	ANDQ R10, DI
   1291 	ANDQ R11, R12
   1292 	ADDQ R13, R15
   1293 
   1294 	ADDQ DX, AX
   1295 	ORQ  R12, DI
   1296 	ADDQ R14, DX
   1297 
   1298 	ADDQ R15, AX
   1299 
   1300 	ADDQ R15, DX
   1301 
   1302 	ADDQ DI, DX
   1303 
   1304 	VPADDQ  1*32(BP), Y5, Y0
   1305 	VMOVDQU Y0, frame_YFER(SP)
   1306 	ADDQ    $(2*32), BP
   1307 
   1308 	MOVQ  BX, R15
   1309 	RORXQ $41, AX, R13
   1310 	RORXQ $18, AX, R14
   1311 	XORQ  CX, R15
   1312 
   1313 	XORQ  R14, R13
   1314 	RORXQ $14, AX, R14
   1315 	ANDQ  AX, R15
   1316 
   1317 	XORQ  R14, R13
   1318 	RORXQ $34, DX, R12
   1319 	XORQ  CX, R15
   1320 	RORXQ $39, DX, R14
   1321 	MOVQ  DX, DI
   1322 
   1323 	XORQ  R12, R14
   1324 	RORXQ $28, DX, R12
   1325 	ADDQ  frame_YFER(SP), R8
   1326 	ORQ   R10, DI
   1327 
   1328 	XORQ R12, R14
   1329 	MOVQ DX, R12
   1330 	ANDQ R9, DI
   1331 	ANDQ R10, R12
   1332 	ADDQ R13, R15
   1333 
   1334 	ADDQ R8, R11
   1335 	ORQ  R12, DI
   1336 	ADDQ R14, R8
   1337 
   1338 	ADDQ R15, R11
   1339 
   1340 	ADDQ  R15, R8
   1341 	MOVQ  AX, R15
   1342 	RORXQ $41, R11, R13
   1343 	RORXQ $18, R11, R14
   1344 	XORQ  BX, R15
   1345 
   1346 	XORQ  R14, R13
   1347 	RORXQ $14, R11, R14
   1348 	ANDQ  R11, R15
   1349 	ADDQ  DI, R8
   1350 
   1351 	XORQ  R14, R13
   1352 	RORXQ $34, R8, R12
   1353 	XORQ  BX, R15
   1354 	RORXQ $39, R8, R14
   1355 	MOVQ  R8, DI
   1356 
   1357 	XORQ  R12, R14
   1358 	RORXQ $28, R8, R12
   1359 	ADDQ  8*1+frame_YFER(SP), CX
   1360 	ORQ   R9, DI
   1361 
   1362 	XORQ R12, R14
   1363 	MOVQ R8, R12
   1364 	ANDQ DX, DI
   1365 	ANDQ R9, R12
   1366 	ADDQ R13, R15
   1367 
   1368 	ADDQ CX, R10
   1369 	ORQ  R12, DI
   1370 	ADDQ R14, CX
   1371 
   1372 	ADDQ R15, R10
   1373 
   1374 	ADDQ  R15, CX
   1375 	MOVQ  R11, R15
   1376 	RORXQ $41, R10, R13
   1377 	RORXQ $18, R10, R14
   1378 	XORQ  AX, R15
   1379 
   1380 	XORQ  R14, R13
   1381 	RORXQ $14, R10, R14
   1382 	ANDQ  R10, R15
   1383 	ADDQ  DI, CX
   1384 
   1385 	XORQ  R14, R13
   1386 	RORXQ $34, CX, R12
   1387 	XORQ  AX, R15
   1388 	RORXQ $39, CX, R14
   1389 	MOVQ  CX, DI
   1390 
   1391 	XORQ  R12, R14
   1392 	RORXQ $28, CX, R12
   1393 	ADDQ  8*2+frame_YFER(SP), BX
   1394 	ORQ   DX, DI
   1395 
   1396 	XORQ R12, R14
   1397 	MOVQ CX, R12
   1398 	ANDQ R8, DI
   1399 	ANDQ DX, R12
   1400 	ADDQ R13, R15
   1401 
   1402 	ADDQ BX, R9
   1403 	ORQ  R12, DI
   1404 	ADDQ R14, BX
   1405 
   1406 	ADDQ R15, R9
   1407 
   1408 	ADDQ  R15, BX
   1409 	MOVQ  R10, R15
   1410 	RORXQ $41, R9, R13
   1411 	RORXQ $18, R9, R14
   1412 	XORQ  R11, R15
   1413 
   1414 	XORQ  R14, R13
   1415 	RORXQ $14, R9, R14
   1416 	ANDQ  R9, R15
   1417 	ADDQ  DI, BX
   1418 
   1419 	XORQ  R14, R13
   1420 	RORXQ $34, BX, R12
   1421 	XORQ  R11, R15
   1422 	RORXQ $39, BX, R14
   1423 	MOVQ  BX, DI
   1424 
   1425 	XORQ  R12, R14
   1426 	RORXQ $28, BX, R12
   1427 	ADDQ  8*3+frame_YFER(SP), AX
   1428 	ORQ   R8, DI
   1429 
   1430 	XORQ R12, R14
   1431 	MOVQ BX, R12
   1432 	ANDQ CX, DI
   1433 	ANDQ R8, R12
   1434 	ADDQ R13, R15
   1435 
   1436 	ADDQ AX, DX
   1437 	ORQ  R12, DI
   1438 	ADDQ R14, AX
   1439 
   1440 	ADDQ R15, DX
   1441 
   1442 	ADDQ R15, AX
   1443 
   1444 	ADDQ DI, AX
   1445 
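	// Slide the remaining scheduled words down for the second pass of loop2.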
   1446 	VMOVDQU Y6, Y4
   1447 	VMOVDQU Y7, Y5
   1448 
   1449 	SUBQ $1, frame_SRND(SP)
   1450 	JNE  loop2
   1451 
   1452 	addm(8*0(SI),AX)
   1453 	addm(8*1(SI),BX)
   1454 	addm(8*2(SI),CX)
   1455 	addm(8*3(SI),R8)
   1456 	addm(8*4(SI),DX)
   1457 	addm(8*5(SI),R9)
   1458 	addm(8*6(SI),R10)
   1459 	addm(8*7(SI),R11)
   1460 
   1461 	MOVQ frame_INP(SP), DI
   1462 	ADDQ $128, DI
   1463 	CMPQ DI, frame_INPEND(SP)
   1464 	JNE  loop0
   1465 
   1466 done_hash:
   1467 	VZEROUPPER
   1468 	RET
   1469