      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "textflag.h"
      6 
      7 // SHA256 block routine. See sha256block.go for Go equivalent.
      8 //
      9 // The algorithm is detailed in FIPS 180-4:
     10 //
     11 //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
     12 
      13 // The AVX2 version is described in an Intel white paper:
     14 // "Fast SHA-256 Implementations on Intel Architecture Processors"
     15 // To find it, surf to http://www.intel.com/p/en_US/embedded
     16 // and search for that title.
     17 // AVX2 version by Intel, same algorithm as code in Linux kernel:
     18 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
     19 // by
     20 //     James Guilford <james.guilford (at) intel.com>
     21 //     Kirk Yap <kirk.s.yap (at) intel.com>
     22 //     Tim Chen <tim.c.chen (at) linux.intel.com>
     23 
     24 // Wt = Mt; for 0 <= t <= 15
      25 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
     26 //
     27 // a = H0
     28 // b = H1
     29 // c = H2
     30 // d = H3
     31 // e = H4
     32 // f = H5
     33 // g = H6
     34 // h = H7
     35 //
     36 // for t = 0 to 63 {
     37 //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
     38 //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
     39 //    h = g
     40 //    g = f
     41 //    f = e
     42 //    e = d + T1
     43 //    d = c
     44 //    c = b
     45 //    b = a
     46 //    a = T1 + T2
     47 // }
     48 //
     49 // H0 = a + H0
     50 // H1 = b + H1
     51 // H2 = c + H2
     52 // H3 = d + H3
     53 // H4 = e + H4
     54 // H5 = f + H5
     55 // H6 = g + H6
     56 // H7 = h + H7
     57 
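         // For cross-reference while reading the macros below, a minimal Go
         // sketch of one round t of the loop above. The helper names (rotr,
         // ch, maj, bigSigma0, bigSigma1) are illustrative only, not the
         // identifiers used in sha256block.go:
         //
         //	func rotr(x uint32, n uint) uint32 { return x>>n | x<<(32-n) }
         //
         //	func ch(x, y, z uint32) uint32  { return (x & y) ^ (^x & z) }
         //	func maj(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
         //
         //	func bigSigma0(x uint32) uint32 { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22) }
         //	func bigSigma1(x uint32) uint32 { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25) }
         //
         //	// one round, using the working variables a..h, the constant table
         //	// _K and the message schedule w:
         //	t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
         //	t2 := bigSigma0(a) + maj(a, b, c)
         //	a, b, c, d, e, f, g, h = t1+t2, a, b, c, d+t1, e, f, g
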
     58 // Wt = Mt; for 0 <= t <= 15
     59 #define MSGSCHEDULE0(index) \
     60 	MOVL	(index*4)(SI), AX; \
     61 	BSWAPL	AX; \
     62 	MOVL	AX, (index*4)(BP)
     63 
     64 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
     65 //   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
     66 //   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
     67 #define MSGSCHEDULE1(index) \
     68 	MOVL	((index-2)*4)(BP), AX; \
     69 	MOVL	AX, CX; \
     70 	RORL	$17, AX; \
     71 	MOVL	CX, DX; \
     72 	RORL	$19, CX; \
     73 	SHRL	$10, DX; \
     74 	MOVL	((index-15)*4)(BP), BX; \
     75 	XORL	CX, AX; \
     76 	MOVL	BX, CX; \
     77 	XORL	DX, AX; \
     78 	RORL	$7, BX; \
     79 	MOVL	CX, DX; \
     80 	SHRL	$3, DX; \
     81 	RORL	$18, CX; \
     82 	ADDL	((index-7)*4)(BP), AX; \
     83 	XORL	CX, BX; \
     84 	XORL	DX, BX; \
     85 	ADDL	((index-16)*4)(BP), BX; \
     86 	ADDL	BX, AX; \
     87 	MOVL	AX, ((index)*4)(BP)
     88 
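         // A Go sketch of the schedule step implemented by MSGSCHEDULE1
         // (sigma0 and sigma1 are illustrative names, rotr as in the sketch
         // above; these helpers do not appear in sha256block.go under these
         // names):
         //
         //	func sigma0(x uint32) uint32 { return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3) }
         //	func sigma1(x uint32) uint32 { return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10) }
         //
         //	// for 16 <= t <= 63
         //	w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
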
     89 // Calculate T1 in AX - uses AX, CX and DX registers.
     90 // h is also used as an accumulator. Wt is passed in AX.
     91 //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
     92 //     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
     93 //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
     94 #define SHA256T1(const, e, f, g, h) \
     95 	ADDL	AX, h; \
     96 	MOVL	e, AX; \
     97 	ADDL	$const, h; \
     98 	MOVL	e, CX; \
     99 	RORL	$6, AX; \
    100 	MOVL	e, DX; \
    101 	RORL	$11, CX; \
    102 	XORL	CX, AX; \
    103 	MOVL	e, CX; \
    104 	RORL	$25, DX; \
    105 	ANDL	f, CX; \
    106 	XORL	AX, DX; \
    107 	MOVL	e, AX; \
    108 	NOTL	AX; \
    109 	ADDL	DX, h; \
    110 	ANDL	g, AX; \
    111 	XORL	CX, AX; \
    112 	ADDL	h, AX
    113 
    114 // Calculate T2 in BX - uses BX, CX, DX and DI registers.
    115 //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
    116 //     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
    117 //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
    118 #define SHA256T2(a, b, c) \
    119 	MOVL	a, DI; \
    120 	MOVL	c, BX; \
    121 	RORL	$2, DI; \
    122 	MOVL	a, DX; \
    123 	ANDL	b, BX; \
    124 	RORL	$13, DX; \
    125 	MOVL	a, CX; \
    126 	ANDL	c, CX; \
    127 	XORL	DX, DI; \
    128 	XORL	CX, BX; \
    129 	MOVL	a, DX; \
    130 	MOVL	b, CX; \
    131 	RORL	$22, DX; \
    132 	ANDL	a, CX; \
    133 	XORL	CX, BX; \
    134 	XORL	DX, DI; \
    135 	ADDL	DI, BX
    136 
    137 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
    138 // The values for e and a are stored in d and h, ready for rotation.
    139 #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
    140 	SHA256T1(const, e, f, g, h); \
    141 	SHA256T2(a, b, c); \
    142 	MOVL	BX, h; \
    143 	ADDL	AX, d; \
    144 	ADDL	AX, h
    145 
    146 #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
    147 	MSGSCHEDULE0(index); \
    148 	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
    149 
    150 #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
    151 	MSGSCHEDULE1(index); \
    152 	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
    153 
    154 
    155 // Definitions for AVX2 version
    156 
     157 // addm(mem, reg)
     158 // Add reg into mem, then load the sum back into reg
    159 #define addm(P1, P2) \
    160 	ADDL P2, P1; \
    161 	MOVL P1, P2
    162 
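         // For example, addm(0(CTX), a) expands to
         //	ADDL a, 0(CTX); MOVL 0(CTX), a
         // i.e. the digest word in memory is updated in place and the sum is
         // loaded back into the working register.
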
    163 #define XDWORD0 Y4
    164 #define XDWORD1 Y5
    165 #define XDWORD2 Y6
    166 #define XDWORD3 Y7
    167 
    168 #define XWORD0 X4
    169 #define XWORD1 X5
    170 #define XWORD2 X6
    171 #define XWORD3 X7
    172 
    173 #define XTMP0 Y0
    174 #define XTMP1 Y1
    175 #define XTMP2 Y2
    176 #define XTMP3 Y3
    177 #define XTMP4 Y8
    178 #define XTMP5 Y11
    179 
    180 #define XFER  Y9
    181 
    182 #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
    183 #define X_BYTE_FLIP_MASK X13
    184 
    185 #define NUM_BYTES DX
    186 #define INP	DI
    187 
    188 #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
    189 
    190 #define a AX
    191 #define b BX
    192 #define c CX
    193 #define d R8
    194 #define e DX
    195 #define f R9
    196 #define g R10
    197 #define h R11
    198 
    199 #define old_h R11
    200 
    201 #define TBL BP
    202 
    203 #define SRND SI // SRND is same register as CTX
    204 
    205 #define T1 R12
    206 
    207 #define y0 R13
    208 #define y1 R14
    209 #define y2 R15
    210 #define y3 DI
    211 
    212 // Offsets
    213 #define XFER_SIZE 2*64*4
    214 #define INP_END_SIZE 8
    215 #define INP_SIZE 8
    216 
    217 #define _XFER 0
    218 #define _INP_END _XFER + XFER_SIZE
    219 #define _INP _INP_END + INP_END_SIZE
    220 #define STACK_SIZE _INP + INP_SIZE
    221 
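         // With the sizes above, XFER_SIZE = 2*64*4 = 512 bytes (room for the
         // K+W values of all 64 rounds for the two interleaved blocks), so the
         // stack layout works out to _XFER = 0, _INP_END = 512, _INP = 520 and
         // STACK_SIZE = 528.
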
    222 #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    223 	;                                     \ // #############################  RND N + 0 ############################//
    224 	MOVL     a, y3;                       \ // y3 = a					// MAJA
    225 	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
    226 	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
    227 	;                                     \
    228 	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
    229 	ORL      c, y3;                       \ // y3 = a|c				// MAJA
    230 	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
    231 	MOVL     f, y2;                       \ // y2 = f				// CH
    232 	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
    233 	;                                     \
    234 	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
    235 	XORL     g, y2;                       \ // y2 = f^g                              	// CH
    236 	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]	// y1 = (e >> 6)	// S1
    237 	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
    238 	;                                     \
    239 	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
    240 	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
    241 	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
    242 	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
    243 	;                                     \
    244 	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
    245 	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
    246 	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
    247 	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
    248 	;                                     \
    249 	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
    250 	VPSRLD   $7, XTMP1, XTMP2;            \
    251 	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
    252 	MOVL     a, T1;                       \ // T1 = a								// MAJB
    253 	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
    254 	;                                     \
    255 	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
    256 	VPSLLD   $(32-7), XTMP1, XTMP3;       \
     257 	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
    258 	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
    259 	;                                     \
    260 	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
    261 	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
    262 	;                                     \
    263 	VPSRLD   $18, XTMP1, XTMP2;           \
    264 	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    265 	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
    266 
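         // Note: AVX2 has no 32-bit vector rotate instruction, so ROTR(n, x)
         // is emulated as (x >> n) | (x << (32-n)); the VPSRLD $7 / VPSLLD
         // $(32-7) / VPOR sequence above computes W[-15] ror 7 this way.
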
    267 #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    268 	;                                    \ // ################################### RND N + 1 ############################
    269 	;                                    \
    270 	MOVL    a, y3;                       \ // y3 = a                       // MAJA
    271 	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
    272 	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
    273 	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
    274 	ORL     c, y3;                       \ // y3 = a|c						// MAJA
    275 	;                                    \
    276 	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
    277 	MOVL    f, y2;                       \ // y2 = f						// CH
    278 	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
    279 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
    280 	XORL    g, y2;                       \ // y2 = f^g						// CH
    281 	;                                    \
    282 	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
    283 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
    284 	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
    285 	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
    286 	ADDL    h, d;                        \ // d = k + w + h + d				// --
    287 	;                                    \
    288 	VPSLLD  $(32-18), XTMP1, XTMP1;      \
    289 	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
    290 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
    291 	;                                    \
    292 	VPXOR   XTMP1, XTMP3, XTMP3;         \
    293 	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
    294 	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
    295 	;                                    \
    296 	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
    297 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
    298 	MOVL    a, T1;                       \ // T1 = a						// MAJB
    299 	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
    300 	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
    301 	;                                    \
    302 	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
    303 	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
     304 	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
    305 	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
    306 	;                                    \
    307 	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
    308 	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
    309 	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    310 	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
    311 	;                                    \
    312 	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
    313 
    314 #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    315 	;                                    \ // ################################### RND N + 2 ############################
    316 	;                                    \
    317 	MOVL    a, y3;                       \ // y3 = a							// MAJA
    318 	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
    319 	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
    320 	;                                    \
    321 	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
    322 	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
    323 	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
    324 	MOVL    f, y2;                       \ // y2 = f                           // CH
    325 	XORL    g, y2;                       \ // y2 = f^g                         // CH
    326 	;                                    \
    327 	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
    328 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
    329 	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
    330 	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
    331 	;                                    \
    332 	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
    333 	VPXOR   XTMP3, XTMP2, XTMP2;         \
    334 	ADDL    h, d;                        \ // d = k + w + h + d				// --
    335 	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
    336 	;                                    \
    337 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
    338 	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
    339 	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
    340 	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
    341 	;                                    \
    342 	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
    343 	;                                    \
    344 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
    345 	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
    346 	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
    347 	;                                    \
    348 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
    349 	MOVL    a, T1;                       \ // T1 = a                                // MAJB
    350 	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
    351 	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
    352 	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
    353 	;                                    \
     354 	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
    355 	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
    356 	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
    357 	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    358 	;                                    \
    359 	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
    360 
    361 #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    362 	;                                    \ // ################################### RND N + 3 ############################
    363 	;                                    \
    364 	MOVL    a, y3;                       \ // y3 = a						// MAJA
    365 	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
    366 	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
    367 	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
    368 	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
    369 	;                                    \
    370 	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
    371 	MOVL    f, y2;                       \ // y2 = f						// CH
    372 	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
    373 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
    374 	XORL    g, y2;                       \ // y2 = f^g						// CH
    375 	;                                    \
    376 	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
    377 	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
    378 	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
    379 	ADDL    h, d;                        \ // d = k + w + h + d			// --
    380 	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
    381 	;                                    \
    382 	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
    383 	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
    384 	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
    385 	;                                    \
    386 	VPXOR   XTMP3, XTMP2, XTMP2;         \
    387 	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
    388 	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
    389 	;                                    \
    390 	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
    391 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
    392 	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
    393 	;                                    \
    394 	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
    395 	;                                    \
    396 	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
    397 	;                                    \
    398 	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
    399 	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
    400 	MOVL    a, T1;                       \ // T1 = a							// MAJB
    401 	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
     402 	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
    403 	;                                    \
    404 	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
    405 	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    406 	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
    407 
    408 #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
    409 	;                                  \ // ################################### RND N + 0 ###########################
    410 	MOVL  f, y2;                       \ // y2 = f					// CH
    411 	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
    412 	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
    413 	XORL  g, y2;                       \ // y2 = f^g					// CH
    414 	;                                  \
    415 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
    416 	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
    417 	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
    418 	;                                  \
    419 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
    420 	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
    421 	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
    422 	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
    423 	MOVL  a, y3;                       \ // y3 = a							// MAJA
    424 	;                                  \
    425 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
    426 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
    427 	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    428 	ORL   c, y3;                       \ // y3 = a|c							// MAJA
    429 	;                                  \
    430 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
    431 	MOVL  a, T1;                       \ // T1 = a							// MAJB
    432 	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
    433 	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
    434 	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
    435 	;                                  \
    436 	ADDL  h, d;                        \ // d = k + w + h + d					// --
     437 	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
    438 	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
    439 	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
    440 
    441 #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
    442 	;                                  \ // ################################### RND N + 1 ###########################
    443 	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    444 	MOVL  f, y2;                       \ // y2 = f                                // CH
    445 	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
    446 	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
    447 	XORL  g, y2;                       \ // y2 = f^g                             // CH
    448 	;                                  \
    449 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
    450 	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
    451 	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
    452 	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
    453 	;                                  \
    454 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
    455 	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
    456 	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
    457 	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
    458 	MOVL  a, y3;                       \ // y3 = a                               // MAJA
    459 	;                                  \
    460 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
    461 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
    462 	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    463 	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
    464 	;                                  \
    465 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
    466 	MOVL  a, T1;                       \ // T1 = a                               // MAJB
    467 	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
    468 	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
    469 	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
    470 	;                                  \
    471 	ADDL  h, d;                        \ // d = k + w + h + d                    // --
     472 	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
    473 	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
    474 	;                                  \
    475 	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
    476 
    477 #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
    478 	;                                  \ // ################################### RND N + 2 ##############################
    479 	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    480 	MOVL  f, y2;                       \ // y2 = f								// CH
    481 	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
    482 	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
    483 	XORL  g, y2;                       \ // y2 = f^g								// CH
    484 	;                                  \
    485 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
    486 	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
    487 	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
    488 	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
    489 	;                                  \
    490 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
    491 	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
    492 	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
    493 	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
    494 	MOVL  a, y3;                       \ // y3 = a								// MAJA
    495 	;                                  \
    496 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
    497 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
    498 	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
    499 	ORL   c, y3;                       \ // y3 = a|c								// MAJA
    500 	;                                  \
    501 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
    502 	MOVL  a, T1;                       \ // T1 = a								// MAJB
    503 	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
    504 	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
    505 	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
    506 	;                                  \
    507 	ADDL  h, d;                        \ // d = k + w + h + d					// --
     508 	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
    509 	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
    510 	;                                  \
    511 	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
    512 
    513 #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
    514 	;                                  \ // ################################### RND N + 3 ###########################
    515 	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    516 	MOVL  f, y2;                       \ // y2 = f								// CH
    517 	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
    518 	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
    519 	XORL  g, y2;                       \ // y2 = f^g								// CH
    520 	;                                  \
    521 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
    522 	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
    523 	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
    524 	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
    525 	;                                  \
    526 	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
    527 	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
    528 	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
    529 	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
    530 	MOVL  a, y3;                       \ // y3 = a								// MAJA
    531 	;                                  \
    532 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
    533 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
    534 	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
    535 	ORL   c, y3;                       \ // y3 = a|c								// MAJA
    536 	;                                  \
    537 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
    538 	MOVL  a, T1;                       \ // T1 = a								// MAJB
    539 	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
    540 	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
    541 	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
    542 	;                                  \
    543 	ADDL  h, d;                        \ // d = k + w + h + d					// --
     544 	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
    545 	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
    546 	;                                  \
    547 	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
    548 	;                                  \
    549 	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
    550 	;                                  \
    551 	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
    552 
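         // Judging by the argument offsets used below (dig+0(FP), p_base+8(FP),
         // p_len+16(FP)) and the 32-byte argument area, the Go-side declaration
         // is presumably of the form:
         //
         //	//go:noescape
         //	func block(dig *digest, p []byte)
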
     553 TEXT ·block(SB), 0, $536-32
     554 	CMPB ·useAVX2(SB), $1
    555 	JE   avx2
    556 
    557 	MOVQ p_base+8(FP), SI
    558 	MOVQ p_len+16(FP), DX
    559 	SHRQ $6, DX
    560 	SHLQ $6, DX
    561 
    562 	LEAQ (SI)(DX*1), DI
    563 	MOVQ DI, 256(SP)
    564 	CMPQ SI, DI
    565 	JEQ  end
    566 
    567 	MOVQ dig+0(FP), BP
    568 	MOVL (0*4)(BP), R8  // a = H0
    569 	MOVL (1*4)(BP), R9  // b = H1
    570 	MOVL (2*4)(BP), R10 // c = H2
    571 	MOVL (3*4)(BP), R11 // d = H3
    572 	MOVL (4*4)(BP), R12 // e = H4
    573 	MOVL (5*4)(BP), R13 // f = H5
    574 	MOVL (6*4)(BP), R14 // g = H6
    575 	MOVL (7*4)(BP), R15 // h = H7
    576 
    577 loop:
    578 	MOVQ SP, BP
    579 
    580 	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
    581 	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
    582 	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
    583 	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
    584 	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
    585 	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
    586 	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
    587 	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
    588 	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
    589 	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
    590 	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
    591 	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
    592 	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
    593 	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
    594 	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
    595 	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
    596 
    597 	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
    598 	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
    599 	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
    600 	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
    601 	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
    602 	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
    603 	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
    604 	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
    605 	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
    606 	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
    607 	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
    608 	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
    609 	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
    610 	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
    611 	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
    612 	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
    613 	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
    614 	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
    615 	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
    616 	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
    617 	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
    618 	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
    619 	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
    620 	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
    621 	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
    622 	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
    623 	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
    624 	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
    625 	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
    626 	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
    627 	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
    628 	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
    629 	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
    630 	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
    631 	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
    632 	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
    633 	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
    634 	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
    635 	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
    636 	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
    637 	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
    638 	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
    639 	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
    640 	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
    641 	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
    642 	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
    643 	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
    644 	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
    645 
    646 	MOVQ dig+0(FP), BP
    647 	ADDL (0*4)(BP), R8  // H0 = a + H0
    648 	MOVL R8, (0*4)(BP)
    649 	ADDL (1*4)(BP), R9  // H1 = b + H1
    650 	MOVL R9, (1*4)(BP)
    651 	ADDL (2*4)(BP), R10 // H2 = c + H2
    652 	MOVL R10, (2*4)(BP)
    653 	ADDL (3*4)(BP), R11 // H3 = d + H3
    654 	MOVL R11, (3*4)(BP)
    655 	ADDL (4*4)(BP), R12 // H4 = e + H4
    656 	MOVL R12, (4*4)(BP)
    657 	ADDL (5*4)(BP), R13 // H5 = f + H5
    658 	MOVL R13, (5*4)(BP)
    659 	ADDL (6*4)(BP), R14 // H6 = g + H6
    660 	MOVL R14, (6*4)(BP)
    661 	ADDL (7*4)(BP), R15 // H7 = h + H7
    662 	MOVL R15, (7*4)(BP)
    663 
    664 	ADDQ $64, SI
    665 	CMPQ SI, 256(SP)
    666 	JB   loop
    667 
    668 end:
    669 	RET
    670 
    671 avx2:
    672 	MOVQ dig+0(FP), CTX          // d.h[8]
    673 	MOVQ p_base+8(FP), INP
    674 	MOVQ p_len+16(FP), NUM_BYTES
    675 
    676 	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
    677 	MOVQ NUM_BYTES, _INP_END(SP)
    678 
    679 	CMPQ NUM_BYTES, INP
    680 	JE   avx2_only_one_block
    681 
    682 	// Load initial digest
    683 	MOVL 0(CTX), a  // a = H0
    684 	MOVL 4(CTX), b  // b = H1
    685 	MOVL 8(CTX), c  // c = H2
    686 	MOVL 12(CTX), d // d = H3
    687 	MOVL 16(CTX), e // e = H4
    688 	MOVL 20(CTX), f // f = H5
    689 	MOVL 24(CTX), g // g = H6
    690 	MOVL 28(CTX), h // h = H7
    691 
     692 avx2_loop0: // each iteration loads two 512-bit blocks (128 bytes); the second one is finished in avx2_loop3
    693 
    694 	VMOVDQU (0*32)(INP), XTMP0
    695 	VMOVDQU (1*32)(INP), XTMP1
    696 	VMOVDQU (2*32)(INP), XTMP2
    697 	VMOVDQU (3*32)(INP), XTMP3
    698 
    699 	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
    700 
    701 	// Apply Byte Flip Mask: LE -> BE
    702 	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
    703 	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
    704 	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
    705 	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
    706 
    707 	// Transpose data into high/low parts
    708 	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
    709 	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
    710 	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
    711 	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
    712 
    713 	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
    714 
    715 avx2_last_block_enter:
    716 	ADDQ $64, INP
    717 	MOVQ INP, _INP(SP)
    718 	XORQ SRND, SRND
    719 
    720 avx2_loop1: // for w0 - w47
    721 	// Do 4 rounds and scheduling
    722 	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
    723 	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
    724 	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    725 	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    726 	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    727 	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    728 
    729 	// Do 4 rounds and scheduling
    730 	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
    731 	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
    732 	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    733 	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    734 	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    735 	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    736 
    737 	// Do 4 rounds and scheduling
    738 	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
    739 	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
    740 	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    741 	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    742 	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    743 	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    744 
    745 	// Do 4 rounds and scheduling
    746 	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
    747 	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
    748 	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    749 	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    750 	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    751 	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    752 
    753 	ADDQ $4*32, SRND
    754 	CMPQ SRND, $3*4*32
    755 	JB   avx2_loop1
    756 
    757 avx2_loop2:
     758 	// w48 - w63 processed with no scheduling (last 16 rounds)
    759 	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
    760 	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
    761 	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
    762 	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
    763 	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
    764 	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
    765 
    766 	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
    767 	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
    768 	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
    769 	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
    770 	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
    771 	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
    772 
    773 	ADDQ $2*32, SRND
    774 
    775 	VMOVDQU XDWORD2, XDWORD0
    776 	VMOVDQU XDWORD3, XDWORD1
    777 
    778 	CMPQ SRND, $4*4*32
    779 	JB   avx2_loop2
    780 
    781 	MOVQ dig+0(FP), CTX // d.h[8]
    782 	MOVQ _INP(SP), INP
    783 
    784 	addm(  0(CTX), a)
    785 	addm(  4(CTX), b)
    786 	addm(  8(CTX), c)
    787 	addm( 12(CTX), d)
    788 	addm( 16(CTX), e)
    789 	addm( 20(CTX), f)
    790 	addm( 24(CTX), g)
    791 	addm( 28(CTX), h)
    792 
    793 	CMPQ _INP_END(SP), INP
    794 	JB   done_hash
    795 
    796 	XORQ SRND, SRND
    797 
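         // The schedule computed in avx2_loop1 covered two blocks at once: the
         // low 128-bit lane of each stored XFER entry holds K+W for the block
         // just folded into the digest above, while the high lane (offset +16)
         // holds K+W for the following 64 bytes of input. avx2_loop3 therefore
         // runs the 64 rounds of the second block without redoing the schedule.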
    798 avx2_loop3: // Do second block using previously scheduled results
    799 	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
    800 	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
    801 	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
    802 	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
    803 
    804 	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
    805 	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
    806 	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
    807 	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
    808 
    809 	ADDQ $2*32, SRND
    810 	CMPQ SRND, $4*4*32
    811 	JB   avx2_loop3
    812 
    813 	MOVQ dig+0(FP), CTX // d.h[8]
    814 	MOVQ _INP(SP), INP
    815 	ADDQ $64, INP
    816 
    817 	addm(  0(CTX), a)
    818 	addm(  4(CTX), b)
    819 	addm(  8(CTX), c)
    820 	addm( 12(CTX), d)
    821 	addm( 16(CTX), e)
    822 	addm( 20(CTX), f)
    823 	addm( 24(CTX), g)
    824 	addm( 28(CTX), h)
    825 
    826 	CMPQ _INP_END(SP), INP
    827 	JA   avx2_loop0
    828 	JB   done_hash
    829 
    830 avx2_do_last_block:
    831 
    832 	VMOVDQU 0(INP), XWORD0
    833 	VMOVDQU 16(INP), XWORD1
    834 	VMOVDQU 32(INP), XWORD2
    835 	VMOVDQU 48(INP), XWORD3
    836 
    837 	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
    838 
    839 	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
    840 	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
    841 	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
    842 	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
    843 
    844 	MOVQ $K256<>(SB), TBL
    845 
    846 	JMP avx2_last_block_enter
    847 
    848 avx2_only_one_block:
    849 	// Load initial digest
    850 	MOVL 0(CTX), a  // a = H0
    851 	MOVL 4(CTX), b  // b = H1
    852 	MOVL 8(CTX), c  // c = H2
    853 	MOVL 12(CTX), d // d = H3
    854 	MOVL 16(CTX), e // e = H4
    855 	MOVL 20(CTX), f // f = H5
    856 	MOVL 24(CTX), g // g = H6
    857 	MOVL 28(CTX), h // h = H7
    858 
    859 	JMP avx2_do_last_block
    860 
    861 done_hash:
    862 	VZEROUPPER
    863 	RET
    864 
    865 // shuffle byte order from LE to BE
    866 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
    867 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    868 DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
    869 DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
    870 GLOBL flip_mask<>(SB), 8, $32
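         // In memory the mask bytes read 3,2,1,0, 7,6,5,4, ... within each
         // 128-bit lane, so VPSHUFB with this mask byte-swaps every 32-bit word.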
    871 
    872 // shuffle xBxA -> 00BA
    873 DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
    874 DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
    875 DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
    876 DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
    877 GLOBL shuff_00BA<>(SB), 8, $32
    878 
    879 // shuffle xDxC -> DC00
    880 DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
    881 DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
    882 DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
    883 DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
    884 GLOBL shuff_DC00<>(SB), 8, $32
    885 
    886 // Round specific constants
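         // Each group of four constants appears twice (once per 128-bit lane),
         // so a single VPADDD can add K to the W values of both interleaved
         // blocks at once.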
    887 DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
    888 DATA K256<>+0x04(SB)/4, $0x71374491 // k2
    889 DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
    890 DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
    891 DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
    892 DATA K256<>+0x14(SB)/4, $0x71374491 // k2
    893 DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
    894 DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
    895 
    896 DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
    897 DATA K256<>+0x24(SB)/4, $0x59f111f1
    898 DATA K256<>+0x28(SB)/4, $0x923f82a4
    899 DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
    900 DATA K256<>+0x30(SB)/4, $0x3956c25b
    901 DATA K256<>+0x34(SB)/4, $0x59f111f1
    902 DATA K256<>+0x38(SB)/4, $0x923f82a4
    903 DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
    904 
    905 DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
    906 DATA K256<>+0x44(SB)/4, $0x12835b01
    907 DATA K256<>+0x48(SB)/4, $0x243185be
    908 DATA K256<>+0x4c(SB)/4, $0x550c7dc3
    909 DATA K256<>+0x50(SB)/4, $0xd807aa98
    910 DATA K256<>+0x54(SB)/4, $0x12835b01
    911 DATA K256<>+0x58(SB)/4, $0x243185be
    912 DATA K256<>+0x5c(SB)/4, $0x550c7dc3
    913 
    914 DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
    915 DATA K256<>+0x64(SB)/4, $0x80deb1fe
    916 DATA K256<>+0x68(SB)/4, $0x9bdc06a7
    917 DATA K256<>+0x6c(SB)/4, $0xc19bf174
    918 DATA K256<>+0x70(SB)/4, $0x72be5d74
    919 DATA K256<>+0x74(SB)/4, $0x80deb1fe
    920 DATA K256<>+0x78(SB)/4, $0x9bdc06a7
    921 DATA K256<>+0x7c(SB)/4, $0xc19bf174
    922 
    923 DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
    924 DATA K256<>+0x84(SB)/4, $0xefbe4786
    925 DATA K256<>+0x88(SB)/4, $0x0fc19dc6
    926 DATA K256<>+0x8c(SB)/4, $0x240ca1cc
    927 DATA K256<>+0x90(SB)/4, $0xe49b69c1
    928 DATA K256<>+0x94(SB)/4, $0xefbe4786
    929 DATA K256<>+0x98(SB)/4, $0x0fc19dc6
    930 DATA K256<>+0x9c(SB)/4, $0x240ca1cc
    931 
    932 DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
    933 DATA K256<>+0xa4(SB)/4, $0x4a7484aa
    934 DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
    935 DATA K256<>+0xac(SB)/4, $0x76f988da
    936 DATA K256<>+0xb0(SB)/4, $0x2de92c6f
    937 DATA K256<>+0xb4(SB)/4, $0x4a7484aa
    938 DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
    939 DATA K256<>+0xbc(SB)/4, $0x76f988da
    940 
    941 DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
    942 DATA K256<>+0xc4(SB)/4, $0xa831c66d
    943 DATA K256<>+0xc8(SB)/4, $0xb00327c8
    944 DATA K256<>+0xcc(SB)/4, $0xbf597fc7
    945 DATA K256<>+0xd0(SB)/4, $0x983e5152
    946 DATA K256<>+0xd4(SB)/4, $0xa831c66d
    947 DATA K256<>+0xd8(SB)/4, $0xb00327c8
    948 DATA K256<>+0xdc(SB)/4, $0xbf597fc7
    949 
    950 DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
    951 DATA K256<>+0xe4(SB)/4, $0xd5a79147
    952 DATA K256<>+0xe8(SB)/4, $0x06ca6351
    953 DATA K256<>+0xec(SB)/4, $0x14292967
    954 DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
    955 DATA K256<>+0xf4(SB)/4, $0xd5a79147
    956 DATA K256<>+0xf8(SB)/4, $0x06ca6351
    957 DATA K256<>+0xfc(SB)/4, $0x14292967
    958 
    959 DATA K256<>+0x100(SB)/4, $0x27b70a85
    960 DATA K256<>+0x104(SB)/4, $0x2e1b2138
    961 DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
    962 DATA K256<>+0x10c(SB)/4, $0x53380d13
    963 DATA K256<>+0x110(SB)/4, $0x27b70a85
    964 DATA K256<>+0x114(SB)/4, $0x2e1b2138
    965 DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
    966 DATA K256<>+0x11c(SB)/4, $0x53380d13
    967 
    968 DATA K256<>+0x120(SB)/4, $0x650a7354
    969 DATA K256<>+0x124(SB)/4, $0x766a0abb
    970 DATA K256<>+0x128(SB)/4, $0x81c2c92e
    971 DATA K256<>+0x12c(SB)/4, $0x92722c85
    972 DATA K256<>+0x130(SB)/4, $0x650a7354
    973 DATA K256<>+0x134(SB)/4, $0x766a0abb
    974 DATA K256<>+0x138(SB)/4, $0x81c2c92e
    975 DATA K256<>+0x13c(SB)/4, $0x92722c85
    976 
    977 DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
    978 DATA K256<>+0x144(SB)/4, $0xa81a664b
    979 DATA K256<>+0x148(SB)/4, $0xc24b8b70
    980 DATA K256<>+0x14c(SB)/4, $0xc76c51a3
    981 DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
    982 DATA K256<>+0x154(SB)/4, $0xa81a664b
    983 DATA K256<>+0x158(SB)/4, $0xc24b8b70
    984 DATA K256<>+0x15c(SB)/4, $0xc76c51a3
    985 
    986 DATA K256<>+0x160(SB)/4, $0xd192e819
    987 DATA K256<>+0x164(SB)/4, $0xd6990624
    988 DATA K256<>+0x168(SB)/4, $0xf40e3585
    989 DATA K256<>+0x16c(SB)/4, $0x106aa070
    990 DATA K256<>+0x170(SB)/4, $0xd192e819
    991 DATA K256<>+0x174(SB)/4, $0xd6990624
    992 DATA K256<>+0x178(SB)/4, $0xf40e3585
    993 DATA K256<>+0x17c(SB)/4, $0x106aa070
    994 
    995 DATA K256<>+0x180(SB)/4, $0x19a4c116
    996 DATA K256<>+0x184(SB)/4, $0x1e376c08
    997 DATA K256<>+0x188(SB)/4, $0x2748774c
    998 DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
    999 DATA K256<>+0x190(SB)/4, $0x19a4c116
   1000 DATA K256<>+0x194(SB)/4, $0x1e376c08
   1001 DATA K256<>+0x198(SB)/4, $0x2748774c
   1002 DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
   1003 
   1004 DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
   1005 DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
   1006 DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
   1007 DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
   1008 DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
   1009 DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
   1010 DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
   1011 DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
   1012 
   1013 DATA K256<>+0x1c0(SB)/4, $0x748f82ee
   1014 DATA K256<>+0x1c4(SB)/4, $0x78a5636f
   1015 DATA K256<>+0x1c8(SB)/4, $0x84c87814
   1016 DATA K256<>+0x1cc(SB)/4, $0x8cc70208
   1017 DATA K256<>+0x1d0(SB)/4, $0x748f82ee
   1018 DATA K256<>+0x1d4(SB)/4, $0x78a5636f
   1019 DATA K256<>+0x1d8(SB)/4, $0x84c87814
   1020 DATA K256<>+0x1dc(SB)/4, $0x8cc70208
   1021 
   1022 DATA K256<>+0x1e0(SB)/4, $0x90befffa
   1023 DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
   1024 DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
   1025 DATA K256<>+0x1ec(SB)/4, $0xc67178f2
   1026 DATA K256<>+0x1f0(SB)/4, $0x90befffa
   1027 DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
   1028 DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
   1029 DATA K256<>+0x1fc(SB)/4, $0xc67178f2
   1030 
   1031 GLOBL K256<>(SB), (NOPTR + RODATA), $512
   1032