Home | History | Annotate | Download | only in sha1
      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // AVX2 version by Intel, same algorithm as code in Linux kernel:
      6 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
      7 // Authors:
      8 // Ilya Albrekht <ilya.albrekht (at) intel.com>
      9 // Maxim Locktyukhin <maxim.locktyukhin (at) intel.com>
     10 // Ronen Zohar <ronen.zohar (at) intel.com>
     11 // Chandramouli Narayanan <mouli (at) linux.intel.com>
     12 
     13 
     14 #include "textflag.h"
     15 
     16 // SHA-1 block routine. See sha1block.go for Go equivalent.
     17 //
     18 // There are 80 rounds of 4 types:
     19 //   - rounds 0-15 are type 1 and load data (ROUND1 macro).
     20 //   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
     21 //   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
     22 //   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
     23 //   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
     24 //
     25 // Each round loads or shuffles the data, then computes a per-round
     26 // function of b, c, d, and then mixes the result into and rotates the
     27 // five registers a, b, c, d, e holding the intermediate results.
     28 //
     29 // The register rotation is implemented by rotating the arguments to
     30 // the round macros instead of by explicit move instructions.
     31 
     32 #define LOAD(index) \
     33 	MOVL	(index*4)(SI), R10; \
     34 	BSWAPL	R10; \
     35 	MOVL	R10, (index*4)(SP)
     36 
     37 #define SHUFFLE(index) \
     38 	MOVL	(((index)&0xf)*4)(SP), R10; \
     39 	XORL	(((index-3)&0xf)*4)(SP), R10; \
     40 	XORL	(((index-8)&0xf)*4)(SP), R10; \
     41 	XORL	(((index-14)&0xf)*4)(SP), R10; \
     42 	ROLL	$1, R10; \
     43 	MOVL	R10, (((index)&0xf)*4)(SP)
     44 
     45 #define FUNC1(a, b, c, d, e) \
     46 	MOVL	d, R9; \
     47 	XORL	c, R9; \
     48 	ANDL	b, R9; \
     49 	XORL	d, R9
     50 
     51 #define FUNC2(a, b, c, d, e) \
     52 	MOVL	b, R9; \
     53 	XORL	c, R9; \
     54 	XORL	d, R9
     55 
     56 #define FUNC3(a, b, c, d, e) \
     57 	MOVL	b, R8; \
     58 	ORL	c, R8; \
     59 	ANDL	d, R8; \
     60 	MOVL	b, R9; \
     61 	ANDL	c, R9; \
     62 	ORL	R8, R9
     63 
     64 #define FUNC4 FUNC2
     65 
     66 #define MIX(a, b, c, d, e, const) \
     67 	ROLL	$30, b; \
     68 	ADDL	R9, e; \
     69 	MOVL	a, R8; \
     70 	ROLL	$5, R8; \
     71 	LEAL	const(e)(R10*1), e; \
     72 	ADDL	R8, e
     73 
     74 #define ROUND1(a, b, c, d, e, index) \
     75 	LOAD(index); \
     76 	FUNC1(a, b, c, d, e); \
     77 	MIX(a, b, c, d, e, 0x5A827999)
     78 
     79 #define ROUND1x(a, b, c, d, e, index) \
     80 	SHUFFLE(index); \
     81 	FUNC1(a, b, c, d, e); \
     82 	MIX(a, b, c, d, e, 0x5A827999)
     83 
     84 #define ROUND2(a, b, c, d, e, index) \
     85 	SHUFFLE(index); \
     86 	FUNC2(a, b, c, d, e); \
     87 	MIX(a, b, c, d, e, 0x6ED9EBA1)
     88 
     89 #define ROUND3(a, b, c, d, e, index) \
     90 	SHUFFLE(index); \
     91 	FUNC3(a, b, c, d, e); \
     92 	MIX(a, b, c, d, e, 0x8F1BBCDC)
     93 
     94 #define ROUND4(a, b, c, d, e, index) \
     95 	SHUFFLE(index); \
     96 	FUNC4(a, b, c, d, e); \
     97 	MIX(a, b, c, d, e, 0xCA62C1D6)
     98 
     99 TEXT blockAMD64(SB),NOSPLIT,$64-32
    100 	MOVQ	dig+0(FP),	BP
    101 	MOVQ	p_base+8(FP),	SI
    102 	MOVQ	p_len+16(FP),	DX
    103 	SHRQ	$6,		DX
    104 	SHLQ	$6,		DX
    105 
    106 	LEAQ	(SI)(DX*1),	DI
    107 	MOVL	(0*4)(BP),	AX
    108 	MOVL	(1*4)(BP),	BX
    109 	MOVL	(2*4)(BP),	CX
    110 	MOVL	(3*4)(BP),	DX
    111 	MOVL	(4*4)(BP),	BP
    112 
    113 	CMPQ	SI,		DI
    114 	JEQ	end
    115 
    116 loop:
    117 	MOVL	AX,	R11
    118 	MOVL	BX,	R12
    119 	MOVL	CX,	R13
    120 	MOVL	DX,	R14
    121 	MOVL	BP,	R15
    122 
    123 	ROUND1(AX, BX, CX, DX, BP, 0)
    124 	ROUND1(BP, AX, BX, CX, DX, 1)
    125 	ROUND1(DX, BP, AX, BX, CX, 2)
    126 	ROUND1(CX, DX, BP, AX, BX, 3)
    127 	ROUND1(BX, CX, DX, BP, AX, 4)
    128 	ROUND1(AX, BX, CX, DX, BP, 5)
    129 	ROUND1(BP, AX, BX, CX, DX, 6)
    130 	ROUND1(DX, BP, AX, BX, CX, 7)
    131 	ROUND1(CX, DX, BP, AX, BX, 8)
    132 	ROUND1(BX, CX, DX, BP, AX, 9)
    133 	ROUND1(AX, BX, CX, DX, BP, 10)
    134 	ROUND1(BP, AX, BX, CX, DX, 11)
    135 	ROUND1(DX, BP, AX, BX, CX, 12)
    136 	ROUND1(CX, DX, BP, AX, BX, 13)
    137 	ROUND1(BX, CX, DX, BP, AX, 14)
    138 	ROUND1(AX, BX, CX, DX, BP, 15)
    139 
    140 	ROUND1x(BP, AX, BX, CX, DX, 16)
    141 	ROUND1x(DX, BP, AX, BX, CX, 17)
    142 	ROUND1x(CX, DX, BP, AX, BX, 18)
    143 	ROUND1x(BX, CX, DX, BP, AX, 19)
    144 
    145 	ROUND2(AX, BX, CX, DX, BP, 20)
    146 	ROUND2(BP, AX, BX, CX, DX, 21)
    147 	ROUND2(DX, BP, AX, BX, CX, 22)
    148 	ROUND2(CX, DX, BP, AX, BX, 23)
    149 	ROUND2(BX, CX, DX, BP, AX, 24)
    150 	ROUND2(AX, BX, CX, DX, BP, 25)
    151 	ROUND2(BP, AX, BX, CX, DX, 26)
    152 	ROUND2(DX, BP, AX, BX, CX, 27)
    153 	ROUND2(CX, DX, BP, AX, BX, 28)
    154 	ROUND2(BX, CX, DX, BP, AX, 29)
    155 	ROUND2(AX, BX, CX, DX, BP, 30)
    156 	ROUND2(BP, AX, BX, CX, DX, 31)
    157 	ROUND2(DX, BP, AX, BX, CX, 32)
    158 	ROUND2(CX, DX, BP, AX, BX, 33)
    159 	ROUND2(BX, CX, DX, BP, AX, 34)
    160 	ROUND2(AX, BX, CX, DX, BP, 35)
    161 	ROUND2(BP, AX, BX, CX, DX, 36)
    162 	ROUND2(DX, BP, AX, BX, CX, 37)
    163 	ROUND2(CX, DX, BP, AX, BX, 38)
    164 	ROUND2(BX, CX, DX, BP, AX, 39)
    165 
    166 	ROUND3(AX, BX, CX, DX, BP, 40)
    167 	ROUND3(BP, AX, BX, CX, DX, 41)
    168 	ROUND3(DX, BP, AX, BX, CX, 42)
    169 	ROUND3(CX, DX, BP, AX, BX, 43)
    170 	ROUND3(BX, CX, DX, BP, AX, 44)
    171 	ROUND3(AX, BX, CX, DX, BP, 45)
    172 	ROUND3(BP, AX, BX, CX, DX, 46)
    173 	ROUND3(DX, BP, AX, BX, CX, 47)
    174 	ROUND3(CX, DX, BP, AX, BX, 48)
    175 	ROUND3(BX, CX, DX, BP, AX, 49)
    176 	ROUND3(AX, BX, CX, DX, BP, 50)
    177 	ROUND3(BP, AX, BX, CX, DX, 51)
    178 	ROUND3(DX, BP, AX, BX, CX, 52)
    179 	ROUND3(CX, DX, BP, AX, BX, 53)
    180 	ROUND3(BX, CX, DX, BP, AX, 54)
    181 	ROUND3(AX, BX, CX, DX, BP, 55)
    182 	ROUND3(BP, AX, BX, CX, DX, 56)
    183 	ROUND3(DX, BP, AX, BX, CX, 57)
    184 	ROUND3(CX, DX, BP, AX, BX, 58)
    185 	ROUND3(BX, CX, DX, BP, AX, 59)
    186 
    187 	ROUND4(AX, BX, CX, DX, BP, 60)
    188 	ROUND4(BP, AX, BX, CX, DX, 61)
    189 	ROUND4(DX, BP, AX, BX, CX, 62)
    190 	ROUND4(CX, DX, BP, AX, BX, 63)
    191 	ROUND4(BX, CX, DX, BP, AX, 64)
    192 	ROUND4(AX, BX, CX, DX, BP, 65)
    193 	ROUND4(BP, AX, BX, CX, DX, 66)
    194 	ROUND4(DX, BP, AX, BX, CX, 67)
    195 	ROUND4(CX, DX, BP, AX, BX, 68)
    196 	ROUND4(BX, CX, DX, BP, AX, 69)
    197 	ROUND4(AX, BX, CX, DX, BP, 70)
    198 	ROUND4(BP, AX, BX, CX, DX, 71)
    199 	ROUND4(DX, BP, AX, BX, CX, 72)
    200 	ROUND4(CX, DX, BP, AX, BX, 73)
    201 	ROUND4(BX, CX, DX, BP, AX, 74)
    202 	ROUND4(AX, BX, CX, DX, BP, 75)
    203 	ROUND4(BP, AX, BX, CX, DX, 76)
    204 	ROUND4(DX, BP, AX, BX, CX, 77)
    205 	ROUND4(CX, DX, BP, AX, BX, 78)
    206 	ROUND4(BX, CX, DX, BP, AX, 79)
    207 
    208 	ADDL	R11, AX
    209 	ADDL	R12, BX
    210 	ADDL	R13, CX
    211 	ADDL	R14, DX
    212 	ADDL	R15, BP
    213 
    214 	ADDQ	$64, SI
    215 	CMPQ	SI, DI
    216 	JB	loop
    217 
    218 end:
    219 	MOVQ	dig+0(FP), DI
    220 	MOVL	AX, (0*4)(DI)
    221 	MOVL	BX, (1*4)(DI)
    222 	MOVL	CX, (2*4)(DI)
    223 	MOVL	DX, (3*4)(DI)
    224 	MOVL	BP, (4*4)(DI)
    225 	RET
    226 
    227 
    228 // This is the implementation using AVX2, BMI1 and BMI2. It is based on:
    229 // "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
    230 // From http://software.intel.com/en-us/articles
    231 // (look for improving-the-performance-of-the-secure-hash-algorithm-1)
    232 // This implementation is 2x unrolled, and interleaves vector instructions,
    233 // used to precompute W, with scalar computation of current round
    234 // for optimal scheduling.
    235 
    236 // Trivial helper macros.
    237 #define UPDATE_HASH(A,TB,C,D,E) \
    238 	ADDL	(R9), A \
    239 	MOVL	A, (R9) \
    240 	ADDL	4(R9), TB \
    241 	MOVL	TB, 4(R9) \
    242 	ADDL	8(R9), C \
    243 	MOVL	C, 8(R9) \
    244 	ADDL	12(R9), D \
    245 	MOVL	D, 12(R9) \
    246 	ADDL	16(R9), E \
    247 	MOVL	E, 16(R9)
    248 
    249 
    250 
    251 // Helper macros for PRECALC, which does precomputations
    252 #define PRECALC_0(OFFSET) \
    253 	VMOVDQU   OFFSET(R10),X0
    254 
    255 #define PRECALC_1(OFFSET) \
    256 	VINSERTI128 $1, OFFSET(R13), Y0, Y0
    257 
    258 #define PRECALC_2(YREG) \
    259 	VPSHUFB Y10, Y0, YREG
    260 
    261 #define PRECALC_4(YREG,K_OFFSET) \
    262 	VPADDD K_OFFSET(R8), YREG, Y0
    263 
    264 #define PRECALC_7(OFFSET) \
    265 	VMOVDQU Y0, (OFFSET*2)(R14)
    266 
    267 
    268 // Message scheduling pre-compute for rounds 0-15
    269 // R13 is a pointer to even 64-byte block
    270 // R10 is a pointer to odd 64-byte block
    271 // R14 is a pointer to temp buffer
    272 // X0 is used as temp register
    273 // YREG is clobbered as part of computation
    274 // OFFSET chooses 16 byte chunk within a block
    275 // R8 is a pointer to constants block
    276 // K_OFFSET chooses K constants relevant to this round
    277 // X10 holds swap mask
    278 #define PRECALC_00_15(OFFSET,YREG) \
    279 	PRECALC_0(OFFSET) \
    280 	PRECALC_1(OFFSET) \
    281 	PRECALC_2(YREG) \
    282 	PRECALC_4(YREG,0x0) \
    283 	PRECALC_7(OFFSET)
    284 
    285 
    286 // Helper macros for PRECALC_16_31
    287 #define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
    288 	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
    289 	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
    290 
    291 #define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
    292 	VPXOR  REG_SUB_8, REG, REG \
    293 	VPXOR  REG_SUB_16, Y0, Y0
    294 
    295 #define PRECALC_18(REG) \
    296 	VPXOR Y0, REG, REG \
    297 	VPSLLDQ $12, REG, Y9
    298 
    299 #define PRECALC_19(REG) \
    300 	VPSLLD $1, REG, Y0 \
    301 	VPSRLD $31, REG, REG
    302 
    303 #define PRECALC_20(REG) \
    304 	VPOR REG, Y0, Y0 \
    305 	VPSLLD $2, Y9,  REG
    306 
    307 #define PRECALC_21(REG) \
    308 	VPSRLD $30, Y9, Y9 \
    309 	VPXOR REG, Y0, Y0
    310 
    311 #define PRECALC_23(REG,K_OFFSET,OFFSET) \
    312 	VPXOR Y9, Y0, REG \
    313 	VPADDD K_OFFSET(R8), REG, Y0 \
    314 	VMOVDQU Y0, (OFFSET)(R14)
    315 
    316 // Message scheduling pre-compute for rounds 16-31
    317 // calculating last 32 w[i] values in 8 XMM registers
    318 // pre-calculate K+w[i] values and store to mem
    319 // for later load by ALU add instruction.
    320 // "brute force" vectorization for rounds 16-31 only
    321 // due to w[i]->w[i-3] dependency.
    322 // clobbers 5 input ymm registers REG_SUB*
    323 // uses X0 and X9 as temp registers
    324 // As always, R8 is a pointer to constants block
    325 // and R14 is a pointer to temp buffer
    326 #define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
    327 	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
    328 	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
    329 	PRECALC_18(REG) \
    330 	PRECALC_19(REG) \
    331 	PRECALC_20(REG) \
    332 	PRECALC_21(REG) \
    333 	PRECALC_23(REG,K_OFFSET,OFFSET)
    334 
    335 
    336 // Helper macros for PRECALC_32_79
    337 #define PRECALC_32(REG_SUB_8,REG_SUB_4) \
    338 	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
    339 
    340 #define PRECALC_33(REG_SUB_28,REG) \
    341 	VPXOR REG_SUB_28, REG, REG
    342 
    343 #define PRECALC_34(REG_SUB_16) \
    344 	VPXOR REG_SUB_16, Y0, Y0
    345 
    346 #define PRECALC_35(REG) \
    347 	VPXOR Y0, REG, REG
    348 
    349 #define PRECALC_36(REG) \
    350 	VPSLLD $2, REG, Y0
    351 
    352 #define PRECALC_37(REG) \
    353 	VPSRLD $30, REG, REG \
    354 	VPOR REG, Y0, REG
    355 
    356 #define PRECALC_39(REG,K_OFFSET,OFFSET) \
    357 	VPADDD K_OFFSET(R8), REG, Y0 \
    358 	VMOVDQU Y0, (OFFSET)(R14)
    359 
    360 // Message scheduling pre-compute for rounds 32-79
    361 // In SHA-1 specification we have:
    362 // w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
    363 // Which is the same as:
    364 // w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
    365 // This allows for more efficient vectorization,
    366 // since w[i]->w[i-3] dependency is broken
    367 #define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
    368 	PRECALC_32(REG_SUB_8,REG_SUB_4) \
    369 	PRECALC_33(REG_SUB_28,REG) \
    370 	PRECALC_34(REG_SUB_16) \
    371 	PRECALC_35(REG) \
    372 	PRECALC_36(REG) \
    373 	PRECALC_37(REG) \
    374 	PRECALC_39(REG,K_OFFSET,OFFSET)
    375 
    376 #define PRECALC \
    377 	PRECALC_00_15(0,Y15) \
    378 	PRECALC_00_15(0x10,Y14) \
    379 	PRECALC_00_15(0x20,Y13) \
    380 	PRECALC_00_15(0x30,Y12) \
    381 	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
    382 	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
    383 	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
    384 	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
    385 	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
    386 	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
    387 	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
    388 	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
    389 	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
    390 	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
    391 	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
    392 	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
    393 	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
    394 	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
    395 	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
    396 	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
    397 
    398 // Macros calculating individual rounds have general forn
    399 // CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
    400 // CALC_ROUND_{PRE,POST} macros follow
    401 
    402 #define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
    403 	ADDL OFFSET(R15),REG_E \
    404 	ANDNL REG_C,REG_A,BP \
    405 	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
    406 	RORXL $0x1b, REG_A, R12 \
    407 	RORXL $2, REG_A, REG_B         // for next round
    408 
    409 // Calculate F for the next round
    410 #define CALC_F1_POST(REG_A,REG_B,REG_E) \
    411 	ANDL REG_B,REG_A \             // b&c
    412 	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
    413 	LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5
    414 
    415 
    416 // Registers are cycleickly rotated DX -> AX -> DI -> SI -> BX -> CX
    417 #define CALC_0 \
    418 	MOVL SI, BX \ // Precalculating first round
    419 	RORXL $2, SI, SI \
    420 	ANDNL AX, BX, BP \
    421 	ANDL DI, BX \
    422 	XORL BP, BX \
    423 	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
    424 	PRECALC_0(0x80) \
    425 	CALC_F1_POST(CX,SI,DX)
    426 
    427 #define CALC_1 \
    428 	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
    429 	PRECALC_1(0x80) \
    430 	CALC_F1_POST(DX,BX,AX)
    431 
    432 #define CALC_2 \
    433 	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
    434 	PRECALC_2(Y15) \
    435 	CALC_F1_POST(AX,CX,DI)
    436 
    437 #define CALC_3 \
    438 	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
    439 	CALC_F1_POST(DI,DX,SI)
    440 
    441 #define CALC_4 \
    442 	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
    443 	PRECALC_4(Y15,0x0) \
    444 	CALC_F1_POST(SI,AX,BX)
    445 
    446 #define CALC_5 \
    447 	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
    448 	CALC_F1_POST(BX,DI,CX)
    449 
    450 #define CALC_6 \
    451 	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
    452 	CALC_F1_POST(CX,SI,DX)
    453 
    454 #define CALC_7 \
    455 	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
    456 	PRECALC_7(0x0) \
    457 	CALC_F1_POST(DX,BX,AX)
    458 
    459 #define CALC_8 \
    460 	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
    461 	PRECALC_0(0x90) \
    462 	CALC_F1_POST(AX,CX,DI)
    463 
    464 #define CALC_9 \
    465 	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
    466 	PRECALC_1(0x90) \
    467 	CALC_F1_POST(DI,DX,SI)
    468 
    469 #define CALC_10 \
    470 	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
    471 	PRECALC_2(Y14) \
    472 	CALC_F1_POST(SI,AX,BX)
    473 
    474 #define CALC_11 \
    475 	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
    476 	CALC_F1_POST(BX,DI,CX)
    477 
    478 #define CALC_12 \
    479 	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
    480 	PRECALC_4(Y14,0x0) \
    481 	CALC_F1_POST(CX,SI,DX)
    482 
    483 #define CALC_13 \
    484 	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
    485 	CALC_F1_POST(DX,BX,AX)
    486 
    487 #define CALC_14 \
    488 	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
    489 	CALC_F1_POST(AX,CX,DI)
    490 
    491 #define CALC_15 \
    492 	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
    493 	PRECALC_7(0x10) \
    494 	CALC_F1_POST(DI,DX,SI)
    495 
    496 #define CALC_16 \
    497 	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
    498 	PRECALC_0(0xa0) \
    499 	CALC_F1_POST(SI,AX,BX)
    500 
    501 #define CALC_17 \
    502 	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
    503 	PRECALC_1(0xa0) \
    504 	CALC_F1_POST(BX,DI,CX)
    505 
    506 #define CALC_18 \
    507 	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
    508 	PRECALC_2(Y13) \
    509 	CALC_F1_POST(CX,SI,DX)
    510 
    511 
    512 #define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
    513 	ADDL OFFSET(R15),REG_E \
    514 	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
    515 	RORXL $0x1b, REG_A, R12 \
    516 	RORXL $2, REG_A, REG_B         // for next round
    517 
    518 #define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
    519 	XORL REG_B, REG_A \
    520 	ADDL R12, REG_E \
    521         XORL REG_C, REG_A
    522 
    523 #define CALC_19 \
    524 	CALC_F2_PRE(0x8c,DX,CX,AX) \
    525 	CALC_F2_POST(DX,BX,SI,AX)
    526 
    527 #define CALC_20 \
    528 	CALC_F2_PRE(0xa0,AX,DX,DI) \
    529 	PRECALC_4(Y13,0x0) \
    530 	CALC_F2_POST(AX,CX,BX,DI)
    531 
    532 #define CALC_21 \
    533 	CALC_F2_PRE(0xa4,DI,AX,SI) \
    534 	CALC_F2_POST(DI,DX,CX,SI)
    535 
    536 #define CALC_22 \
    537 	CALC_F2_PRE(0xa8,SI,DI,BX) \
    538 	CALC_F2_POST(SI,AX,DX,BX)
    539 
    540 #define CALC_23 \
    541 	CALC_F2_PRE(0xac,BX,SI,CX) \
    542 	PRECALC_7(0x20) \
    543 	CALC_F2_POST(BX,DI,AX,CX)
    544 
    545 #define CALC_24 \
    546 	CALC_F2_PRE(0xc0,CX,BX,DX) \
    547 	PRECALC_0(0xb0) \
    548 	CALC_F2_POST(CX,SI,DI,DX)
    549 
    550 #define CALC_25 \
    551 	CALC_F2_PRE(0xc4,DX,CX,AX) \
    552 	PRECALC_1(0xb0) \
    553 	CALC_F2_POST(DX,BX,SI,AX)
    554 
    555 #define CALC_26 \
    556 	CALC_F2_PRE(0xc8,AX,DX,DI) \
    557 	PRECALC_2(Y12) \
    558 	CALC_F2_POST(AX,CX,BX,DI)
    559 
    560 #define CALC_27 \
    561 	CALC_F2_PRE(0xcc,DI,AX,SI) \
    562 	CALC_F2_POST(DI,DX,CX,SI)
    563 
    564 #define CALC_28 \
    565 	CALC_F2_PRE(0xe0,SI,DI,BX) \
    566 	PRECALC_4(Y12,0x0) \
    567 	CALC_F2_POST(SI,AX,DX,BX)
    568 
    569 #define CALC_29 \
    570 	CALC_F2_PRE(0xe4,BX,SI,CX) \
    571 	CALC_F2_POST(BX,DI,AX,CX)
    572 
    573 #define CALC_30 \
    574 	CALC_F2_PRE(0xe8,CX,BX,DX) \
    575 	CALC_F2_POST(CX,SI,DI,DX)
    576 
    577 #define CALC_31 \
    578 	CALC_F2_PRE(0xec,DX,CX,AX) \
    579 	PRECALC_7(0x30) \
    580 	CALC_F2_POST(DX,BX,SI,AX)
    581 
    582 #define CALC_32 \
    583 	CALC_F2_PRE(0x100,AX,DX,DI) \
    584 	PRECALC_16(Y15,Y14,Y12,Y8) \
    585 	CALC_F2_POST(AX,CX,BX,DI)
    586 
    587 #define CALC_33 \
    588 	CALC_F2_PRE(0x104,DI,AX,SI) \
    589 	PRECALC_17(Y15,Y13,Y8) \
    590 	CALC_F2_POST(DI,DX,CX,SI)
    591 
    592 #define CALC_34 \
    593 	CALC_F2_PRE(0x108,SI,DI,BX) \
    594 	PRECALC_18(Y8) \
    595 	CALC_F2_POST(SI,AX,DX,BX)
    596 
    597 #define CALC_35 \
    598 	CALC_F2_PRE(0x10c,BX,SI,CX) \
    599 	PRECALC_19(Y8) \
    600 	CALC_F2_POST(BX,DI,AX,CX)
    601 
    602 #define CALC_36 \
    603 	CALC_F2_PRE(0x120,CX,BX,DX) \
    604 	PRECALC_20(Y8) \
    605 	CALC_F2_POST(CX,SI,DI,DX)
    606 
    607 #define CALC_37 \
    608 	CALC_F2_PRE(0x124,DX,CX,AX) \
    609 	PRECALC_21(Y8) \
    610 	CALC_F2_POST(DX,BX,SI,AX)
    611 
    612 #define CALC_38 \
    613 	CALC_F2_PRE(0x128,AX,DX,DI) \
    614 	CALC_F2_POST(AX,CX,BX,DI)
    615 
    616 
    617 #define CALC_F3_PRE(OFFSET,REG_E) \
    618 	ADDL OFFSET(R15),REG_E
    619 
    620 #define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
    621 	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
    622 	MOVL REG_B, BP \
    623 	ORL  REG_A, BP \
    624 	RORXL $0x1b, REG_A, R12 \
    625 	RORXL $2, REG_A, REG_TB \
    626 	ANDL REG_C, BP \		// Calculate F for the next round
    627 	ANDL REG_B, REG_A \
    628 	ORL  BP, REG_A \
    629 	ADDL R12, REG_E
    630 
    631 #define CALC_39 \
    632 	CALC_F3_PRE(0x12c,SI) \
    633 	PRECALC_23(Y8,0x0,0x80) \
    634 	CALC_F3_POST(DI,DX,CX,SI,AX)
    635 
    636 #define CALC_40 \
    637 	CALC_F3_PRE(0x140,BX) \
    638 	PRECALC_16(Y14,Y13,Y8,Y7) \
    639 	CALC_F3_POST(SI,AX,DX,BX,DI)
    640 
    641 #define CALC_41 \
    642 	CALC_F3_PRE(0x144,CX) \
    643 	PRECALC_17(Y14,Y12,Y7) \
    644 	CALC_F3_POST(BX,DI,AX,CX,SI)
    645 
    646 #define CALC_42 \
    647 	CALC_F3_PRE(0x148,DX) \
    648 	PRECALC_18(Y7) \
    649 	CALC_F3_POST(CX,SI,DI,DX,BX)
    650 
    651 #define CALC_43 \
    652 	CALC_F3_PRE(0x14c,AX) \
    653 	PRECALC_19(Y7) \
    654 	CALC_F3_POST(DX,BX,SI,AX,CX)
    655 
    656 #define CALC_44 \
    657 	CALC_F3_PRE(0x160,DI) \
    658 	PRECALC_20(Y7) \
    659 	CALC_F3_POST(AX,CX,BX,DI,DX)
    660 
    661 #define CALC_45 \
    662 	CALC_F3_PRE(0x164,SI) \
    663 	PRECALC_21(Y7) \
    664 	CALC_F3_POST(DI,DX,CX,SI,AX)
    665 
    666 #define CALC_46 \
    667 	CALC_F3_PRE(0x168,BX) \
    668 	CALC_F3_POST(SI,AX,DX,BX,DI)
    669 
    670 #define CALC_47 \
    671 	CALC_F3_PRE(0x16c,CX) \
    672 	VPXOR Y9, Y0, Y7 \
    673 	VPADDD 0x20(R8), Y7, Y0 \
    674 	VMOVDQU Y0, 0xa0(R14) \
    675 	CALC_F3_POST(BX,DI,AX,CX,SI)
    676 
    677 #define CALC_48 \
    678 	CALC_F3_PRE(0x180,DX) \
    679 	PRECALC_16(Y13,Y12,Y7,Y5) \
    680 	CALC_F3_POST(CX,SI,DI,DX,BX)
    681 
    682 #define CALC_49 \
    683 	CALC_F3_PRE(0x184,AX) \
    684 	PRECALC_17(Y13,Y8,Y5) \
    685 	CALC_F3_POST(DX,BX,SI,AX,CX)
    686 
    687 #define CALC_50 \
    688 	CALC_F3_PRE(0x188,DI) \
    689 	PRECALC_18(Y5) \
    690 	CALC_F3_POST(AX,CX,BX,DI,DX)
    691 
    692 #define CALC_51 \
    693 	CALC_F3_PRE(0x18c,SI) \
    694 	PRECALC_19(Y5) \
    695 	CALC_F3_POST(DI,DX,CX,SI,AX)
    696 
    697 #define CALC_52 \
    698 	CALC_F3_PRE(0x1a0,BX) \
    699 	PRECALC_20(Y5) \
    700 	CALC_F3_POST(SI,AX,DX,BX,DI)
    701 
    702 #define CALC_53 \
    703 	CALC_F3_PRE(0x1a4,CX) \
    704 	PRECALC_21(Y5) \
    705 	CALC_F3_POST(BX,DI,AX,CX,SI)
    706 
    707 #define CALC_54 \
    708 	CALC_F3_PRE(0x1a8,DX) \
    709 	CALC_F3_POST(CX,SI,DI,DX,BX)
    710 
    711 #define CALC_55 \
    712 	CALC_F3_PRE(0x1ac,AX) \
    713 	PRECALC_23(Y5,0x20,0xc0) \
    714 	CALC_F3_POST(DX,BX,SI,AX,CX)
    715 
    716 #define CALC_56 \
    717 	CALC_F3_PRE(0x1c0,DI) \
    718 	PRECALC_16(Y12,Y8,Y5,Y3) \
    719 	CALC_F3_POST(AX,CX,BX,DI,DX)
    720 
    721 #define CALC_57 \
    722 	CALC_F3_PRE(0x1c4,SI) \
    723 	PRECALC_17(Y12,Y7,Y3) \
    724 	CALC_F3_POST(DI,DX,CX,SI,AX)
    725 
    726 #define CALC_58 \
    727 	CALC_F3_PRE(0x1c8,BX) \
    728 	PRECALC_18(Y3) \
    729 	CALC_F3_POST(SI,AX,DX,BX,DI)
    730 
    731 #define CALC_59 \
    732 	CALC_F2_PRE(0x1cc,BX,SI,CX) \
    733 	PRECALC_19(Y3) \
    734 	CALC_F2_POST(BX,DI,AX,CX)
    735 
    736 #define CALC_60 \
    737 	CALC_F2_PRE(0x1e0,CX,BX,DX) \
    738 	PRECALC_20(Y3) \
    739 	CALC_F2_POST(CX,SI,DI,DX)
    740 
    741 #define CALC_61 \
    742 	CALC_F2_PRE(0x1e4,DX,CX,AX) \
    743 	PRECALC_21(Y3) \
    744 	CALC_F2_POST(DX,BX,SI,AX)
    745 
    746 #define CALC_62 \
    747 	CALC_F2_PRE(0x1e8,AX,DX,DI) \
    748 	CALC_F2_POST(AX,CX,BX,DI)
    749 
    750 #define CALC_63 \
    751 	CALC_F2_PRE(0x1ec,DI,AX,SI) \
    752 	PRECALC_23(Y3,0x20,0xe0) \
    753 	CALC_F2_POST(DI,DX,CX,SI)
    754 
    755 #define CALC_64 \
    756 	CALC_F2_PRE(0x200,SI,DI,BX) \
    757 	PRECALC_32(Y5,Y3) \
    758 	CALC_F2_POST(SI,AX,DX,BX)
    759 
    760 #define CALC_65 \
    761 	CALC_F2_PRE(0x204,BX,SI,CX) \
    762 	PRECALC_33(Y14,Y15) \
    763 	CALC_F2_POST(BX,DI,AX,CX)
    764 
    765 #define CALC_66 \
    766 	CALC_F2_PRE(0x208,CX,BX,DX) \
    767 	PRECALC_34(Y8) \
    768 	CALC_F2_POST(CX,SI,DI,DX)
    769 
    770 #define CALC_67 \
    771 	CALC_F2_PRE(0x20c,DX,CX,AX) \
    772 	PRECALC_35(Y15) \
    773 	CALC_F2_POST(DX,BX,SI,AX)
    774 
    775 #define CALC_68 \
    776 	CALC_F2_PRE(0x220,AX,DX,DI) \
    777 	PRECALC_36(Y15) \
    778 	CALC_F2_POST(AX,CX,BX,DI)
    779 
    780 #define CALC_69 \
    781 	CALC_F2_PRE(0x224,DI,AX,SI) \
    782 	PRECALC_37(Y15) \
    783 	CALC_F2_POST(DI,DX,CX,SI)
    784 
    785 #define CALC_70 \
    786 	CALC_F2_PRE(0x228,SI,DI,BX) \
    787 	CALC_F2_POST(SI,AX,DX,BX)
    788 
    789 #define CALC_71 \
    790 	CALC_F2_PRE(0x22c,BX,SI,CX) \
    791 	PRECALC_39(Y15,0x20,0x100) \
    792 	CALC_F2_POST(BX,DI,AX,CX)
    793 
    794 #define CALC_72 \
    795 	CALC_F2_PRE(0x240,CX,BX,DX) \
    796 	PRECALC_32(Y3,Y15) \
    797 	CALC_F2_POST(CX,SI,DI,DX)
    798 
    799 #define CALC_73 \
    800 	CALC_F2_PRE(0x244,DX,CX,AX) \
    801 	PRECALC_33(Y13,Y14) \
    802 	CALC_F2_POST(DX,BX,SI,AX)
    803 
    804 #define CALC_74 \
    805 	CALC_F2_PRE(0x248,AX,DX,DI) \
    806 	PRECALC_34(Y7) \
    807 	CALC_F2_POST(AX,CX,BX,DI)
    808 
    809 #define CALC_75 \
    810 	CALC_F2_PRE(0x24c,DI,AX,SI) \
    811 	PRECALC_35(Y14) \
    812 	CALC_F2_POST(DI,DX,CX,SI)
    813 
    814 #define CALC_76 \
    815 	CALC_F2_PRE(0x260,SI,DI,BX) \
    816 	PRECALC_36(Y14) \
    817 	CALC_F2_POST(SI,AX,DX,BX)
    818 
    819 #define CALC_77 \
    820 	CALC_F2_PRE(0x264,BX,SI,CX) \
    821 	PRECALC_37(Y14) \
    822 	CALC_F2_POST(BX,DI,AX,CX)
    823 
    824 #define CALC_78 \
    825 	CALC_F2_PRE(0x268,CX,BX,DX) \
    826 	CALC_F2_POST(CX,SI,DI,DX)
    827 
    828 #define CALC_79 \
    829 	ADDL 0x26c(R15), AX \
    830 	LEAL (AX)(CX*1), AX \
    831 	RORXL $0x1b, DX, R12 \
    832 	PRECALC_39(Y14,0x20,0x120) \
    833 	ADDL R12, AX
    834 
    835 // Similar to CALC_0
    836 #define CALC_80 \
    837 	MOVL CX, DX \
    838 	RORXL $2, CX, CX \
    839 	ANDNL SI, DX, BP \
    840 	ANDL BX, DX \
    841 	XORL BP, DX \
    842 	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
    843 	PRECALC_32(Y15,Y14) \
    844 	CALC_F1_POST(AX,CX,DI)
    845 
    846 #define CALC_81 \
    847 	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
    848 	PRECALC_33(Y12,Y13) \
    849 	CALC_F1_POST(DI,DX,SI)
    850 
    851 #define CALC_82 \
    852 	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
    853 	PRECALC_34(Y5) \
    854 	CALC_F1_POST(SI,AX,BX)
    855 
    856 #define CALC_83 \
    857 	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
    858 	PRECALC_35(Y13) \
    859 	CALC_F1_POST(BX,DI,CX)
    860 
    861 #define CALC_84 \
    862 	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
    863 	PRECALC_36(Y13) \
    864 	CALC_F1_POST(CX,SI,DX)
    865 
    866 #define CALC_85 \
    867 	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
    868 	PRECALC_37(Y13) \
    869 	CALC_F1_POST(DX,BX,AX)
    870 
    871 #define CALC_86 \
    872 	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
    873 	CALC_F1_POST(AX,CX,DI)
    874 
    875 #define CALC_87 \
    876 	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
    877 	PRECALC_39(Y13,0x40,0x140) \
    878 	CALC_F1_POST(DI,DX,SI)
    879 
    880 #define CALC_88 \
    881 	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
    882 	PRECALC_32(Y14,Y13) \
    883 	CALC_F1_POST(SI,AX,BX)
    884 
    885 #define CALC_89 \
    886 	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
    887 	PRECALC_33(Y8,Y12) \
    888 	CALC_F1_POST(BX,DI,CX)
    889 
    890 #define CALC_90 \
    891 	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
    892 	PRECALC_34(Y3) \
    893 	CALC_F1_POST(CX,SI,DX)
    894 
    895 #define CALC_91 \
    896 	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
    897 	PRECALC_35(Y12) \
    898 	CALC_F1_POST(DX,BX,AX)
    899 
    900 #define CALC_92 \
    901 	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
    902 	PRECALC_36(Y12) \
    903 	CALC_F1_POST(AX,CX,DI)
    904 
    905 #define CALC_93 \
    906 	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
    907 	PRECALC_37(Y12) \
    908 	CALC_F1_POST(DI,DX,SI)
    909 
    910 #define CALC_94 \
    911 	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
    912 	CALC_F1_POST(SI,AX,BX)
    913 
    914 #define CALC_95 \
    915 	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
    916 	PRECALC_39(Y12,0x40,0x160) \
    917 	CALC_F1_POST(BX,DI,CX)
    918 
    919 #define CALC_96 \
    920 	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
    921 	PRECALC_32(Y13,Y12) \
    922 	CALC_F1_POST(CX,SI,DX)
    923 
    924 #define CALC_97 \
    925 	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
    926 	PRECALC_33(Y7,Y8) \
    927 	CALC_F1_POST(DX,BX,AX)
    928 
    929 #define CALC_98 \
    930 	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
    931 	PRECALC_34(Y15) \
    932 	CALC_F1_POST(AX,CX,DI)
    933 
    934 #define CALC_99 \
    935 	CALC_F2_PRE(0x9c,DI,AX,SI) \
    936 	PRECALC_35(Y8) \
    937 	CALC_F2_POST(DI,DX,CX,SI)
    938 
    939 #define CALC_100 \
    940 	CALC_F2_PRE(0xb0,SI,DI,BX) \
    941 	PRECALC_36(Y8) \
    942 	CALC_F2_POST(SI,AX,DX,BX)
    943 
    944 #define CALC_101 \
    945 	CALC_F2_PRE(0xb4,BX,SI,CX) \
    946 	PRECALC_37(Y8) \
    947 	CALC_F2_POST(BX,DI,AX,CX)
    948 
    949 #define CALC_102 \
    950 	CALC_F2_PRE(0xb8,CX,BX,DX) \
    951 	CALC_F2_POST(CX,SI,DI,DX)
    952 
    953 #define CALC_103 \
    954 	CALC_F2_PRE(0xbc,DX,CX,AX) \
    955 	PRECALC_39(Y8,0x40,0x180) \
    956 	CALC_F2_POST(DX,BX,SI,AX)
    957 
    958 #define CALC_104 \
    959 	CALC_F2_PRE(0xd0,AX,DX,DI) \
    960 	PRECALC_32(Y12,Y8) \
    961 	CALC_F2_POST(AX,CX,BX,DI)
    962 
    963 #define CALC_105 \
    964 	CALC_F2_PRE(0xd4,DI,AX,SI) \
    965 	PRECALC_33(Y5,Y7) \
    966 	CALC_F2_POST(DI,DX,CX,SI)
    967 
    968 #define CALC_106 \
    969 	CALC_F2_PRE(0xd8,SI,DI,BX) \
    970 	PRECALC_34(Y14) \
    971 	CALC_F2_POST(SI,AX,DX,BX)
    972 
    973 #define CALC_107 \
    974 	CALC_F2_PRE(0xdc,BX,SI,CX) \
    975 	PRECALC_35(Y7) \
    976 	CALC_F2_POST(BX,DI,AX,CX)
    977 
    978 #define CALC_108 \
    979 	CALC_F2_PRE(0xf0,CX,BX,DX) \
    980 	PRECALC_36(Y7) \
    981 	CALC_F2_POST(CX,SI,DI,DX)
    982 
    983 #define CALC_109 \
    984 	CALC_F2_PRE(0xf4,DX,CX,AX) \
    985 	PRECALC_37(Y7) \
    986 	CALC_F2_POST(DX,BX,SI,AX)
    987 
    988 #define CALC_110 \
    989 	CALC_F2_PRE(0xf8,AX,DX,DI) \
    990 	CALC_F2_POST(AX,CX,BX,DI)
    991 
    992 #define CALC_111 \
    993 	CALC_F2_PRE(0xfc,DI,AX,SI) \
    994 	PRECALC_39(Y7,0x40,0x1a0) \
    995 	CALC_F2_POST(DI,DX,CX,SI)
    996 
    997 #define CALC_112 \
    998 	CALC_F2_PRE(0x110,SI,DI,BX) \
    999 	PRECALC_32(Y8,Y7) \
   1000 	CALC_F2_POST(SI,AX,DX,BX)
   1001 
   1002 #define CALC_113 \
   1003 	CALC_F2_PRE(0x114,BX,SI,CX) \
   1004 	PRECALC_33(Y3,Y5) \
   1005 	CALC_F2_POST(BX,DI,AX,CX)
   1006 
   1007 #define CALC_114 \
   1008 	CALC_F2_PRE(0x118,CX,BX,DX) \
   1009 	PRECALC_34(Y13) \
   1010 	CALC_F2_POST(CX,SI,DI,DX)
   1011 
   1012 #define CALC_115 \
   1013 	CALC_F2_PRE(0x11c,DX,CX,AX) \
   1014 	PRECALC_35(Y5) \
   1015 	CALC_F2_POST(DX,BX,SI,AX)
   1016 
   1017 #define CALC_116 \
   1018 	CALC_F2_PRE(0x130,AX,DX,DI) \
   1019 	PRECALC_36(Y5) \
   1020 	CALC_F2_POST(AX,CX,BX,DI)
   1021 
   1022 #define CALC_117 \
   1023 	CALC_F2_PRE(0x134,DI,AX,SI) \
   1024 	PRECALC_37(Y5) \
   1025 	CALC_F2_POST(DI,DX,CX,SI)
   1026 
   1027 #define CALC_118 \
   1028 	CALC_F2_PRE(0x138,SI,DI,BX) \
   1029 	CALC_F2_POST(SI,AX,DX,BX)
   1030 
   1031 #define CALC_119 \
   1032 	CALC_F3_PRE(0x13c,CX) \
   1033 	PRECALC_39(Y5,0x40,0x1c0) \
   1034 	CALC_F3_POST(BX,DI,AX,CX,SI)
   1035 
   1036 #define CALC_120 \
   1037 	CALC_F3_PRE(0x150,DX) \
   1038 	PRECALC_32(Y7,Y5) \
   1039 	CALC_F3_POST(CX,SI,DI,DX,BX)
   1040 
   1041 #define CALC_121 \
   1042 	CALC_F3_PRE(0x154,AX) \
   1043 	PRECALC_33(Y15,Y3) \
   1044 	CALC_F3_POST(DX,BX,SI,AX,CX)
   1045 
   1046 #define CALC_122 \
   1047 	CALC_F3_PRE(0x158,DI) \
   1048 	PRECALC_34(Y12) \
   1049 	CALC_F3_POST(AX,CX,BX,DI,DX)
   1050 
   1051 #define CALC_123 \
   1052 	CALC_F3_PRE(0x15c,SI) \
   1053 	PRECALC_35(Y3) \
   1054 	CALC_F3_POST(DI,DX,CX,SI,AX)
   1055 
   1056 #define CALC_124 \
   1057 	CALC_F3_PRE(0x170,BX) \
   1058 	PRECALC_36(Y3) \
   1059 	CALC_F3_POST(SI,AX,DX,BX,DI)
   1060 
   1061 #define CALC_125 \
   1062 	CALC_F3_PRE(0x174,CX) \
   1063 	PRECALC_37(Y3) \
   1064 	CALC_F3_POST(BX,DI,AX,CX,SI)
   1065 
   1066 #define CALC_126 \
   1067 	CALC_F3_PRE(0x178,DX) \
   1068 	CALC_F3_POST(CX,SI,DI,DX,BX)
   1069 
   1070 #define CALC_127 \
   1071 	CALC_F3_PRE(0x17c,AX) \
   1072 	PRECALC_39(Y3,0x60,0x1e0) \
   1073 	CALC_F3_POST(DX,BX,SI,AX,CX)
   1074 
   1075 #define CALC_128 \
   1076 	CALC_F3_PRE(0x190,DI) \
   1077 	PRECALC_32(Y5,Y3) \
   1078 	CALC_F3_POST(AX,CX,BX,DI,DX)
   1079 
   1080 #define CALC_129 \
   1081 	CALC_F3_PRE(0x194,SI) \
   1082 	PRECALC_33(Y14,Y15) \
   1083 	CALC_F3_POST(DI,DX,CX,SI,AX)
   1084 
   1085 #define CALC_130 \
   1086 	CALC_F3_PRE(0x198,BX) \
   1087 	PRECALC_34(Y8) \
   1088 	CALC_F3_POST(SI,AX,DX,BX,DI)
   1089 
   1090 #define CALC_131 \
   1091 	CALC_F3_PRE(0x19c,CX) \
   1092 	PRECALC_35(Y15) \
   1093 	CALC_F3_POST(BX,DI,AX,CX,SI)
   1094 
   1095 #define CALC_132 \
   1096 	CALC_F3_PRE(0x1b0,DX) \
   1097 	PRECALC_36(Y15) \
   1098 	CALC_F3_POST(CX,SI,DI,DX,BX)
   1099 
   1100 #define CALC_133 \
   1101 	CALC_F3_PRE(0x1b4,AX) \
   1102 	PRECALC_37(Y15) \
   1103 	CALC_F3_POST(DX,BX,SI,AX,CX)
   1104 
   1105 #define CALC_134 \
   1106 	CALC_F3_PRE(0x1b8,DI) \
   1107 	CALC_F3_POST(AX,CX,BX,DI,DX)
   1108 
   1109 #define CALC_135 \
   1110 	CALC_F3_PRE(0x1bc,SI) \
   1111 	PRECALC_39(Y15,0x60,0x200) \
   1112 	CALC_F3_POST(DI,DX,CX,SI,AX)
   1113 
   1114 #define CALC_136 \
   1115 	CALC_F3_PRE(0x1d0,BX) \
   1116 	PRECALC_32(Y3,Y15) \
   1117 	CALC_F3_POST(SI,AX,DX,BX,DI)
   1118 
   1119 #define CALC_137 \
   1120 	CALC_F3_PRE(0x1d4,CX) \
   1121 	PRECALC_33(Y13,Y14) \
   1122 	CALC_F3_POST(BX,DI,AX,CX,SI)
   1123 
   1124 #define CALC_138 \
   1125 	CALC_F3_PRE(0x1d8,DX) \
   1126 	PRECALC_34(Y7) \
   1127 	CALC_F3_POST(CX,SI,DI,DX,BX)
   1128 
   1129 #define CALC_139 \
   1130 	CALC_F2_PRE(0x1dc,DX,CX,AX) \
   1131 	PRECALC_35(Y14) \
   1132 	CALC_F2_POST(DX,BX,SI,AX)
   1133 
   1134 #define CALC_140 \
   1135 	CALC_F2_PRE(0x1f0,AX,DX,DI) \
   1136 	PRECALC_36(Y14) \
   1137 	CALC_F2_POST(AX,CX,BX,DI)
   1138 
   1139 #define CALC_141 \
   1140 	CALC_F2_PRE(0x1f4,DI,AX,SI) \
   1141 	PRECALC_37(Y14) \
   1142 	CALC_F2_POST(DI,DX,CX,SI)
   1143 
   1144 #define CALC_142 \
   1145 	CALC_F2_PRE(0x1f8,SI,DI,BX) \
   1146 	CALC_F2_POST(SI,AX,DX,BX)
   1147 
   1148 #define CALC_143 \
   1149 	CALC_F2_PRE(0x1fc,BX,SI,CX) \
   1150 	PRECALC_39(Y14,0x60,0x220) \
   1151 	CALC_F2_POST(BX,DI,AX,CX)
   1152 
   1153 #define CALC_144 \
   1154 	CALC_F2_PRE(0x210,CX,BX,DX) \
   1155 	PRECALC_32(Y15,Y14) \
   1156 	CALC_F2_POST(CX,SI,DI,DX)
   1157 
   1158 #define CALC_145 \
   1159 	CALC_F2_PRE(0x214,DX,CX,AX) \
   1160 	PRECALC_33(Y12,Y13) \
   1161 	CALC_F2_POST(DX,BX,SI,AX)
   1162 
   1163 #define CALC_146 \
   1164 	CALC_F2_PRE(0x218,AX,DX,DI) \
   1165 	PRECALC_34(Y5) \
   1166 	CALC_F2_POST(AX,CX,BX,DI)
   1167 
   1168 #define CALC_147 \
   1169 	CALC_F2_PRE(0x21c,DI,AX,SI) \
   1170 	PRECALC_35(Y13) \
   1171 	CALC_F2_POST(DI,DX,CX,SI)
   1172 
   1173 #define CALC_148 \
   1174 	CALC_F2_PRE(0x230,SI,DI,BX) \
   1175 	PRECALC_36(Y13) \
   1176 	CALC_F2_POST(SI,AX,DX,BX)
   1177 
   1178 #define CALC_149 \
   1179 	CALC_F2_PRE(0x234,BX,SI,CX) \
   1180 	PRECALC_37(Y13) \
   1181 	CALC_F2_POST(BX,DI,AX,CX)
   1182 
   1183 #define CALC_150 \
   1184 	CALC_F2_PRE(0x238,CX,BX,DX) \
   1185 	CALC_F2_POST(CX,SI,DI,DX)
   1186 
   1187 #define CALC_151 \
   1188 	CALC_F2_PRE(0x23c,DX,CX,AX) \
   1189 	PRECALC_39(Y13,0x60,0x240) \
   1190 	CALC_F2_POST(DX,BX,SI,AX)
   1191 
   1192 #define CALC_152 \
   1193 	CALC_F2_PRE(0x250,AX,DX,DI) \
   1194 	PRECALC_32(Y14,Y13) \
   1195 	CALC_F2_POST(AX,CX,BX,DI)
   1196 
   1197 #define CALC_153 \
   1198 	CALC_F2_PRE(0x254,DI,AX,SI) \
   1199 	PRECALC_33(Y8,Y12) \
   1200 	CALC_F2_POST(DI,DX,CX,SI)
   1201 
   1202 #define CALC_154 \
   1203 	CALC_F2_PRE(0x258,SI,DI,BX) \
   1204 	PRECALC_34(Y3) \
   1205 	CALC_F2_POST(SI,AX,DX,BX)
   1206 
   1207 #define CALC_155 \
   1208 	CALC_F2_PRE(0x25c,BX,SI,CX) \
   1209 	PRECALC_35(Y12) \
   1210 	CALC_F2_POST(BX,DI,AX,CX)
   1211 
   1212 #define CALC_156 \
   1213 	CALC_F2_PRE(0x270,CX,BX,DX) \
   1214 	PRECALC_36(Y12) \
   1215 	CALC_F2_POST(CX,SI,DI,DX)
   1216 
   1217 #define CALC_157 \
   1218 	CALC_F2_PRE(0x274,DX,CX,AX) \
   1219 	PRECALC_37(Y12) \
   1220 	CALC_F2_POST(DX,BX,SI,AX)
   1221 
   1222 #define CALC_158 \
   1223 	CALC_F2_PRE(0x278,AX,DX,DI) \
   1224 	CALC_F2_POST(AX,CX,BX,DI)
   1225 
   1226 #define CALC_159 \
   1227 	ADDL 0x27c(R15),SI \
   1228 	LEAL (SI)(AX*1), SI \
   1229 	RORXL $0x1b, DI, R12 \
   1230 	PRECALC_39(Y12,0x60,0x260) \
   1231 	ADDL R12, SI
   1232 
   1233 
   1234 
   1235 #define CALC \
   1236 	MOVL	(R9), CX \
   1237 	MOVL	4(R9), SI \
   1238 	MOVL	8(R9), DI \
   1239 	MOVL	12(R9), AX \
   1240 	MOVL	16(R9), DX \
   1241 	MOVQ    SP, R14 \
   1242 	LEAQ    (2*4*80+32)(SP), R15 \
   1243 	PRECALC \ // Precalc WK for first 2 blocks
   1244 	XCHGQ   R15, R14 \
   1245 loop: \  // this loops is unrolled
   1246 	CMPQ    R10, R8 \ // we use R8 value (set below) as a signal of a last block
   1247 	JNE	begin \
   1248 	VZEROUPPER \
   1249 	RET \
   1250 begin: \
   1251 	CALC_0 \
   1252 	CALC_1 \
   1253 	CALC_2 \
   1254 	CALC_3 \
   1255 	CALC_4 \
   1256 	CALC_5 \
   1257 	CALC_6 \
   1258 	CALC_7 \
   1259 	CALC_8 \
   1260 	CALC_9 \
   1261 	CALC_10 \
   1262 	CALC_11 \
   1263 	CALC_12 \
   1264 	CALC_13 \
   1265 	CALC_14 \
   1266 	CALC_15 \
   1267 	CALC_16 \
   1268 	CALC_17 \
   1269 	CALC_18 \
   1270 	CALC_19 \
   1271 	CALC_20 \
   1272 	CALC_21 \
   1273 	CALC_22 \
   1274 	CALC_23 \
   1275 	CALC_24 \
   1276 	CALC_25 \
   1277 	CALC_26 \
   1278 	CALC_27 \
   1279 	CALC_28 \
   1280 	CALC_29 \
   1281 	CALC_30 \
   1282 	CALC_31 \
   1283 	CALC_32 \
   1284 	CALC_33 \
   1285 	CALC_34 \
   1286 	CALC_35 \
   1287 	CALC_36 \
   1288 	CALC_37 \
   1289 	CALC_38 \
   1290 	CALC_39 \
   1291 	CALC_40 \
   1292 	CALC_41 \
   1293 	CALC_42 \
   1294 	CALC_43 \
   1295 	CALC_44 \
   1296 	CALC_45 \
   1297 	CALC_46 \
   1298 	CALC_47 \
   1299 	CALC_48 \
   1300 	CALC_49 \
   1301 	CALC_50 \
   1302 	CALC_51 \
   1303 	CALC_52 \
   1304 	CALC_53 \
   1305 	CALC_54 \
   1306 	CALC_55 \
   1307 	CALC_56 \
   1308 	CALC_57 \
   1309 	CALC_58 \
   1310 	CALC_59 \
   1311 	ADDQ $128, R10 \ // move to next even-64-byte block
   1312 	CMPQ R10, R11 \ // is current block the last one?
   1313 	CMOVQCC R8, R10 \ // signal the last iteration smartly
   1314 	CALC_60 \
   1315 	CALC_61 \
   1316 	CALC_62 \
   1317 	CALC_63 \
   1318 	CALC_64 \
   1319 	CALC_65 \
   1320 	CALC_66 \
   1321 	CALC_67 \
   1322 	CALC_68 \
   1323 	CALC_69 \
   1324 	CALC_70 \
   1325 	CALC_71 \
   1326 	CALC_72 \
   1327 	CALC_73 \
   1328 	CALC_74 \
   1329 	CALC_75 \
   1330 	CALC_76 \
   1331 	CALC_77 \
   1332 	CALC_78 \
   1333 	CALC_79 \
   1334 	UPDATE_HASH(AX,DX,BX,SI,DI) \
   1335 	CMPQ R10, R8 \ // is current block the last one?
   1336 	JE loop\
   1337 	MOVL DX, CX \
   1338 	CALC_80 \
   1339 	CALC_81 \
   1340 	CALC_82 \
   1341 	CALC_83 \
   1342 	CALC_84 \
   1343 	CALC_85 \
   1344 	CALC_86 \
   1345 	CALC_87 \
   1346 	CALC_88 \
   1347 	CALC_89 \
   1348 	CALC_90 \
   1349 	CALC_91 \
   1350 	CALC_92 \
   1351 	CALC_93 \
   1352 	CALC_94 \
   1353 	CALC_95 \
   1354 	CALC_96 \
   1355 	CALC_97 \
   1356 	CALC_98 \
   1357 	CALC_99 \
   1358 	CALC_100 \
   1359 	CALC_101 \
   1360 	CALC_102 \
   1361 	CALC_103 \
   1362 	CALC_104 \
   1363 	CALC_105 \
   1364 	CALC_106 \
   1365 	CALC_107 \
   1366 	CALC_108 \
   1367 	CALC_109 \
   1368 	CALC_110 \
   1369 	CALC_111 \
   1370 	CALC_112 \
   1371 	CALC_113 \
   1372 	CALC_114 \
   1373 	CALC_115 \
   1374 	CALC_116 \
   1375 	CALC_117 \
   1376 	CALC_118 \
   1377 	CALC_119 \
   1378 	CALC_120 \
   1379 	CALC_121 \
   1380 	CALC_122 \
   1381 	CALC_123 \
   1382 	CALC_124 \
   1383 	CALC_125 \
   1384 	CALC_126 \
   1385 	CALC_127 \
   1386 	CALC_128 \
   1387 	CALC_129 \
   1388 	CALC_130 \
   1389 	CALC_131 \
   1390 	CALC_132 \
   1391 	CALC_133 \
   1392 	CALC_134 \
   1393 	CALC_135 \
   1394 	CALC_136 \
   1395 	CALC_137 \
   1396 	CALC_138 \
   1397 	CALC_139 \
   1398 	ADDQ $128, R13 \ //move to next even-64-byte block
   1399 	CMPQ R13, R11 \ //is current block the last one?
   1400 	CMOVQCC R8, R10 \
   1401 	CALC_140 \
   1402 	CALC_141 \
   1403 	CALC_142 \
   1404 	CALC_143 \
   1405 	CALC_144 \
   1406 	CALC_145 \
   1407 	CALC_146 \
   1408 	CALC_147 \
   1409 	CALC_148 \
   1410 	CALC_149 \
   1411 	CALC_150 \
   1412 	CALC_151 \
   1413 	CALC_152 \
   1414 	CALC_153 \
   1415 	CALC_154 \
   1416 	CALC_155 \
   1417 	CALC_156 \
   1418 	CALC_157 \
   1419 	CALC_158 \
   1420 	CALC_159 \
   1421 	UPDATE_HASH(SI,DI,DX,CX,BX) \
   1422 	MOVL	SI, R12 \ //Reset state for  AVX2 reg permutation
   1423 	MOVL	DI, SI \
   1424 	MOVL	DX, DI \
   1425 	MOVL	BX, DX \
   1426 	MOVL	CX, AX \
   1427 	MOVL	R12, CX \
   1428 	XCHGQ   R15, R14 \
   1429 	JMP     loop
   1430 
   1431 
   1432 
   1433 TEXT blockAVX2(SB),$1408-32
   1434 
   1435 	MOVQ	dig+0(FP),	DI
   1436 	MOVQ	p_base+8(FP),	SI
   1437 	MOVQ	p_len+16(FP),	DX
   1438 	SHRQ	$6,		DX
   1439 	SHLQ	$6,		DX
   1440 
   1441 	MOVQ	$K_XMM_AR<>(SB), R8
   1442 
   1443 	MOVQ	DI, R9
   1444 	MOVQ	SI, R10
   1445 	LEAQ	64(SI), R13
   1446 
   1447 	ADDQ	SI, DX
   1448 	ADDQ	$64, DX
   1449 	MOVQ	DX, R11
   1450 
   1451 	CMPQ	R13, R11
   1452 	CMOVQCC	R8, R13
   1453 
   1454 	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
   1455 
   1456 	CALC // RET is inside macros
   1457 
   1458 DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
   1459 DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
   1460 DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
   1461 DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
   1462 DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
   1463 DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
   1464 DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
   1465 DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
   1466 DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
   1467 DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
   1468 DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
   1469 DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
   1470 DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
   1471 DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
   1472 DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
   1473 DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
   1474 DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
   1475 DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
   1476 DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
   1477 DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
   1478 DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
   1479 DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
   1480 DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
   1481 DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
   1482 DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
   1483 DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
   1484 DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
   1485 DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
   1486 DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
   1487 DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
   1488 DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
   1489 DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
   1490 GLOBL K_XMM_AR<>(SB),RODATA,$128
   1491 
   1492 DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
   1493 DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
   1494 DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
   1495 DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
   1496 DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
   1497 DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
   1498 DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
   1499 DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
   1500 GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
   1501