Home | History | Annotate | Download | only in chacha20poly1305
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
      6 
      7 // +build go1.7,amd64,!gccgo,!appengine
      8 
      9 #include "textflag.h"
     10 // General register allocation
     11 #define oup DI
     12 #define inp SI
     13 #define inl BX
     14 #define adp CX // free to reuse, after we hash the additional data
     15 #define keyp R8 // free to reuse, when we copy the key to stack
     16 #define itr2 R9 // general iterator
     17 #define itr1 CX // general iterator
     18 #define acc0 R10
     19 #define acc1 R11
     20 #define acc2 R12
     21 #define t0 R13
     22 #define t1 R14
     23 #define t2 R15
     24 #define t3 R8
     25 // Register and stack allocation for the SSE code
     26 #define rStore (0*16)(BP)
     27 #define sStore (1*16)(BP)
     28 #define state1Store (2*16)(BP)
     29 #define state2Store (3*16)(BP)
     30 #define tmpStore (4*16)(BP)
     31 #define ctr0Store (5*16)(BP)
     32 #define ctr1Store (6*16)(BP)
     33 #define ctr2Store (7*16)(BP)
     34 #define ctr3Store (8*16)(BP)
     35 #define A0 X0
     36 #define A1 X1
     37 #define A2 X2
     38 #define B0 X3
     39 #define B1 X4
     40 #define B2 X5
     41 #define C0 X6
     42 #define C1 X7
     43 #define C2 X8
     44 #define D0 X9
     45 #define D1 X10
     46 #define D2 X11
     47 #define T0 X12
     48 #define T1 X13
     49 #define T2 X14
     50 #define T3 X15
     51 #define A3 T0
     52 #define B3 T1
     53 #define C3 T2
     54 #define D3 T3
     55 // Register and stack allocation for the AVX2 code
     56 #define rsStoreAVX2 (0*32)(BP)
     57 #define state1StoreAVX2 (1*32)(BP)
     58 #define state2StoreAVX2 (2*32)(BP)
     59 #define ctr0StoreAVX2 (3*32)(BP)
     60 #define ctr1StoreAVX2 (4*32)(BP)
     61 #define ctr2StoreAVX2 (5*32)(BP)
     62 #define ctr3StoreAVX2 (6*32)(BP)
     63 #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
     64 #define AA0 Y0
     65 #define AA1 Y5
     66 #define AA2 Y6
     67 #define AA3 Y7
     68 #define BB0 Y14
     69 #define BB1 Y9
     70 #define BB2 Y10
     71 #define BB3 Y11
     72 #define CC0 Y12
     73 #define CC1 Y13
     74 #define CC2 Y8
     75 #define CC3 Y15
     76 #define DD0 Y4
     77 #define DD1 Y1
     78 #define DD2 Y2
     79 #define DD3 Y3
     80 #define TT0 DD3
     81 #define TT1 AA3
     82 #define TT2 BB3
     83 #define TT3 CC3
     84 // ChaCha20 constants
     85 DATA chacha20Constants<>+0x00(SB)/4, $0x61707865
     86 DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e
     87 DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32
     88 DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574
     89 DATA chacha20Constants<>+0x10(SB)/4, $0x61707865
     90 DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e
     91 DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32
     92 DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574
     93 // <<< 16 with PSHUFB
     94 DATA rol16<>+0x00(SB)/8, $0x0504070601000302
     95 DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
     96 DATA rol16<>+0x10(SB)/8, $0x0504070601000302
     97 DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
     98 // <<< 8 with PSHUFB
     99 DATA rol8<>+0x00(SB)/8, $0x0605040702010003
    100 DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
    101 DATA rol8<>+0x10(SB)/8, $0x0605040702010003
    102 DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
    103 
    104 DATA avx2InitMask<>+0x00(SB)/8, $0x0
    105 DATA avx2InitMask<>+0x08(SB)/8, $0x0
    106 DATA avx2InitMask<>+0x10(SB)/8, $0x1
    107 DATA avx2InitMask<>+0x18(SB)/8, $0x0
    108 
    109 DATA avx2IncMask<>+0x00(SB)/8, $0x2
    110 DATA avx2IncMask<>+0x08(SB)/8, $0x0
    111 DATA avx2IncMask<>+0x10(SB)/8, $0x2
    112 DATA avx2IncMask<>+0x18(SB)/8, $0x0
    113 // Poly1305 key clamp
    114 DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
    115 DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
    116 DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
    117 DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
    118 
    119 DATA sseIncMask<>+0x00(SB)/8, $0x1
    120 DATA sseIncMask<>+0x08(SB)/8, $0x0
    121 // To load/store the last < 16 bytes in a buffer
    122 DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    123 DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    124 DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    125 DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    126 DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    127 DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    128 DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    129 DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    130 DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    131 DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    132 DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    133 DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    134 DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    135 DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    136 DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    137 DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    138 DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    139 DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    140 DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    141 DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    142 DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    143 DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    144 DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    145 DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    146 DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    147 DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    148 DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    149 DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    150 DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    151 DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    152 
    153 GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32
    154 GLOBL rol16<>(SB), (NOPTR+RODATA), $32
    155 GLOBL rol8<>(SB), (NOPTR+RODATA), $32
    156 GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16
    157 GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32
    158 GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32
    159 GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32
    160 GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    161 // No PALIGNR in Go ASM yet (but VPALIGNR is present).
    162 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
    163 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
    164 #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
    165 #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
    166 #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
    167 #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
    168 #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
    169 #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
    170 #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
    171 #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
    172 #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
    173 #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
    174 #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
    175 #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
    176 #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
    177 #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
    178 #define shiftC0Right shiftC0Left
    179 #define shiftC1Right shiftC1Left
    180 #define shiftC2Right shiftC2Left
    181 #define shiftC3Right shiftC3Left
    182 #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
    183 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
    184 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
    185 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
    186 // Some macros
    187 #define chachaQR(A, B, C, D, T) \
    188 	PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D                            \
    189 	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
    190 	PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D                             \
    191 	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
    192 
    193 #define chachaQR_AVX2(A, B, C, D, T) \
    194 	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D                         \
    195 	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
    196 	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D                          \
    197 	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
    198 
    199 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
    200 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
    201 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
    202 #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
    203 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
    204 
    205 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
    206 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
    207 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
    208 
    209 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
    210 #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
    211 // ----------------------------------------------------------------------------
    212 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
    213 	// adp points to beginning of additional data
    214 	// itr2 holds ad length
    215 	XORQ acc0, acc0
    216 	XORQ acc1, acc1
    217 	XORQ acc2, acc2
    218 	CMPQ itr2, $13
    219 	JNE  hashADLoop
    220 
    221 openFastTLSAD:
    222 	// Special treatment for the TLS case of 13 bytes
    223 	MOVQ (adp), acc0
    224 	MOVQ 5(adp), acc1
    225 	SHRQ $24, acc1
    226 	MOVQ $1, acc2
    227 	polyMul
    228 	RET
    229 
    230 hashADLoop:
    231 	// Hash in 16 byte chunks
    232 	CMPQ itr2, $16
    233 	JB   hashADTail
    234 	polyAdd(0(adp))
    235 	LEAQ (1*16)(adp), adp
    236 	SUBQ $16, itr2
    237 	polyMul
    238 	JMP  hashADLoop
    239 
    240 hashADTail:
    241 	CMPQ itr2, $0
    242 	JE   hashADDone
    243 
    244 	// Hash last < 16 byte tail
    245 	XORQ t0, t0
    246 	XORQ t1, t1
    247 	XORQ t2, t2
    248 	ADDQ itr2, adp
    249 
    250 hashADTailLoop:
    251 	SHLQ $8, t1:t0
    252 	SHLQ $8, t0
    253 	MOVB -1(adp), t2
    254 	XORQ t2, t0
    255 	DECQ adp
    256 	DECQ itr2
    257 	JNE  hashADTailLoop
    258 
    259 hashADTailFinish:
    260 	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
    261 	polyMul
    262 
    263 	// Finished AD
    264 hashADDone:
    265 	RET
    266 
    267 // ----------------------------------------------------------------------------
    268 // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
    269 TEXT chacha20Poly1305Open(SB), 0, $288-97
    270 	// For aligned stack access
    271 	MOVQ SP, BP
    272 	ADDQ $32, BP
    273 	ANDQ $-32, BP
    274 	MOVQ dst+0(FP), oup
    275 	MOVQ key+24(FP), keyp
    276 	MOVQ src+48(FP), inp
    277 	MOVQ src_len+56(FP), inl
    278 	MOVQ ad+72(FP), adp
    279 
    280 	// Check for AVX2 support
    281 	CMPB runtimesupport_avx2(SB), $0
    282 	JE   noavx2bmi2Open
    283 
    284 	// Check BMI2 bit for MULXQ.
    285 	// runtimecpuid_ebx7 is always available here
    286 	// because it passed avx2 check
    287 	TESTL $(1<<8), runtimecpuid_ebx7(SB)
    288 	JNE   chacha20Poly1305Open_AVX2
    289 noavx2bmi2Open:
    290 
    291 	// Special optimization, for very short buffers
    292 	CMPQ inl, $128
    293 	JBE  openSSE128 // About 16% faster
    294 
    295 	// For long buffers, prepare the poly key first
    296 	MOVOU chacha20Constants<>(SB), A0
    297 	MOVOU (1*16)(keyp), B0
    298 	MOVOU (2*16)(keyp), C0
    299 	MOVOU (3*16)(keyp), D0
    300 	MOVO  D0, T1
    301 
    302 	// Store state on stack for future use
    303 	MOVO B0, state1Store
    304 	MOVO C0, state2Store
    305 	MOVO D0, ctr3Store
    306 	MOVQ $10, itr2
    307 
    308 openSSEPreparePolyKey:
    309 	chachaQR(A0, B0, C0, D0, T0)
    310 	shiftB0Left;  shiftC0Left; shiftD0Left
    311 	chachaQR(A0, B0, C0, D0, T0)
    312 	shiftB0Right; shiftC0Right; shiftD0Right
    313 	DECQ          itr2
    314 	JNE           openSSEPreparePolyKey
    315 
    316 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
    317 	PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0
    318 
    319 	// Clamp and store the key
    320 	PAND polyClampMask<>(SB), A0
    321 	MOVO A0, rStore; MOVO B0, sStore
    322 
    323 	// Hash AAD
    324 	MOVQ ad_len+80(FP), itr2
    325 	CALL polyHashADInternal<>(SB)
    326 
    327 openSSEMainLoop:
    328 	CMPQ inl, $256
    329 	JB   openSSEMainLoopDone
    330 
    331 	// Load state, increment counter blocks
    332 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
    333 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
    334 	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
    335 	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
    336 
    337 	// Store counters
    338 	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
    339 
    340 	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
    341 	MOVQ $4, itr1
    342 	MOVQ inp, itr2
    343 
    344 openSSEInternalLoop:
    345 	MOVO          C3, tmpStore
    346 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
    347 	MOVO          tmpStore, C3
    348 	MOVO          C1, tmpStore
    349 	chachaQR(A3, B3, C3, D3, C1)
    350 	MOVO          tmpStore, C1
    351 	polyAdd(0(itr2))
    352 	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
    353 	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
    354 	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
    355 	polyMulStage1
    356 	polyMulStage2
    357 	LEAQ          (2*8)(itr2), itr2
    358 	MOVO          C3, tmpStore
    359 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
    360 	MOVO          tmpStore, C3
    361 	MOVO          C1, tmpStore
    362 	polyMulStage3
    363 	chachaQR(A3, B3, C3, D3, C1)
    364 	MOVO          tmpStore, C1
    365 	polyMulReduceStage
    366 	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
    367 	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
    368 	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
    369 	DECQ          itr1
    370 	JGE           openSSEInternalLoop
    371 
    372 	polyAdd(0(itr2))
    373 	polyMul
    374 	LEAQ (2*8)(itr2), itr2
    375 
    376 	CMPQ itr1, $-6
    377 	JG   openSSEInternalLoop
    378 
    379 	// Add in the state
    380 	PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
    381 	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
    382 	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
    383 	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
    384 
    385 	// Load - xor - store
    386 	MOVO  D3, tmpStore
    387 	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
    388 	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
    389 	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
    390 	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
    391 	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
    392 	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
    393 	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
    394 	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
    395 	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
    396 	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
    397 	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
    398 	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
    399 	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
    400 	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
    401 	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
    402 	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
    403 	LEAQ  256(inp), inp
    404 	LEAQ  256(oup), oup
    405 	SUBQ  $256, inl
    406 	JMP   openSSEMainLoop
    407 
    408 openSSEMainLoopDone:
    409 	// Handle the various tail sizes efficiently
    410 	TESTQ inl, inl
    411 	JE    openSSEFinalize
    412 	CMPQ  inl, $64
    413 	JBE   openSSETail64
    414 	CMPQ  inl, $128
    415 	JBE   openSSETail128
    416 	CMPQ  inl, $192
    417 	JBE   openSSETail192
    418 	JMP   openSSETail256
    419 
    420 openSSEFinalize:
    421 	// Hash in the PT, AAD lengths
    422 	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
    423 	polyMul
    424 
    425 	// Final reduce
    426 	MOVQ    acc0, t0
    427 	MOVQ    acc1, t1
    428 	MOVQ    acc2, t2
    429 	SUBQ    $-5, acc0
    430 	SBBQ    $-1, acc1
    431 	SBBQ    $3, acc2
    432 	CMOVQCS t0, acc0
    433 	CMOVQCS t1, acc1
    434 	CMOVQCS t2, acc2
    435 
    436 	// Add in the "s" part of the key
    437 	ADDQ 0+sStore, acc0
    438 	ADCQ 8+sStore, acc1
    439 
    440 	// Finally, constant time compare to the tag at the end of the message
    441 	XORQ    AX, AX
    442 	MOVQ    $1, DX
    443 	XORQ    (0*8)(inp), acc0
    444 	XORQ    (1*8)(inp), acc1
    445 	ORQ     acc1, acc0
    446 	CMOVQEQ DX, AX
    447 
    448 	// Return true iff tags are equal
    449 	MOVB AX, ret+96(FP)
    450 	RET
    451 
    452 // ----------------------------------------------------------------------------
    453 // Special optimization for buffers smaller than 129 bytes
    454 openSSE128:
    455 	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
    456 	MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
    457 	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
    458 	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
    459 	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
    460 	MOVQ  $10, itr2
    461 
    462 openSSE128InnerCipherLoop:
    463 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
    464 	shiftB0Left;  shiftB1Left; shiftB2Left
    465 	shiftC0Left;  shiftC1Left; shiftC2Left
    466 	shiftD0Left;  shiftD1Left; shiftD2Left
    467 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
    468 	shiftB0Right; shiftB1Right; shiftB2Right
    469 	shiftC0Right; shiftC1Right; shiftC2Right
    470 	shiftD0Right; shiftD1Right; shiftD2Right
    471 	DECQ          itr2
    472 	JNE           openSSE128InnerCipherLoop
    473 
    474 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
    475 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
    476 	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
    477 	PADDL T2, C1; PADDL T2, C2
    478 	PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
    479 
    480 	// Clamp and store the key
    481 	PAND  polyClampMask<>(SB), A0
    482 	MOVOU A0, rStore; MOVOU B0, sStore
    483 
    484 	// Hash
    485 	MOVQ ad_len+80(FP), itr2
    486 	CALL polyHashADInternal<>(SB)
    487 
    488 openSSE128Open:
    489 	CMPQ inl, $16
    490 	JB   openSSETail16
    491 	SUBQ $16, inl
    492 
    493 	// Load for hashing
    494 	polyAdd(0(inp))
    495 
    496 	// Load for decryption
    497 	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
    498 	LEAQ  (1*16)(inp), inp
    499 	LEAQ  (1*16)(oup), oup
    500 	polyMul
    501 
    502 	// Shift the stream "left"
    503 	MOVO B1, A1
    504 	MOVO C1, B1
    505 	MOVO D1, C1
    506 	MOVO A2, D1
    507 	MOVO B2, A2
    508 	MOVO C2, B2
    509 	MOVO D2, C2
    510 	JMP  openSSE128Open
    511 
    512 openSSETail16:
    513 	TESTQ inl, inl
    514 	JE    openSSEFinalize
    515 
    516 	// We can safely load the CT from the end, because it is padded with the MAC
    517 	MOVQ   inl, itr2
    518 	SHLQ   $4, itr2
    519 	LEAQ   andMask<>(SB), t0
    520 	MOVOU  (inp), T0
    521 	ADDQ   inl, inp
    522 	PAND   -16(t0)(itr2*1), T0
    523 	MOVO   T0, 0+tmpStore
    524 	MOVQ   T0, t0
    525 	MOVQ   8+tmpStore, t1
    526 	PXOR   A1, T0
    527 
    528 	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
    529 openSSETail16Store:
    530 	MOVQ T0, t3
    531 	MOVB t3, (oup)
    532 	PSRLDQ $1, T0
    533 	INCQ   oup
    534 	DECQ   inl
    535 	JNE    openSSETail16Store
    536 	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
    537 	polyMul
    538 	JMP    openSSEFinalize
    539 
    540 // ----------------------------------------------------------------------------
    541 // Special optimization for the last 64 bytes of ciphertext
    542 openSSETail64:
    543 	// Need to decrypt up to 64 bytes - prepare single block
    544 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
    545 	XORQ itr2, itr2
    546 	MOVQ inl, itr1
    547 	CMPQ itr1, $16
    548 	JB   openSSETail64LoopB
    549 
    550 openSSETail64LoopA:
    551 	// Perform ChaCha rounds, while hashing the remaining input
    552 	polyAdd(0(inp)(itr2*1))
    553 	polyMul
    554 	SUBQ $16, itr1
    555 
    556 openSSETail64LoopB:
    557 	ADDQ          $16, itr2
    558 	chachaQR(A0, B0, C0, D0, T0)
    559 	shiftB0Left;  shiftC0Left; shiftD0Left
    560 	chachaQR(A0, B0, C0, D0, T0)
    561 	shiftB0Right; shiftC0Right; shiftD0Right
    562 
    563 	CMPQ itr1, $16
    564 	JAE  openSSETail64LoopA
    565 
    566 	CMPQ itr2, $160
    567 	JNE  openSSETail64LoopB
    568 
    569 	PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
    570 
    571 openSSETail64DecLoop:
    572 	CMPQ  inl, $16
    573 	JB    openSSETail64DecLoopDone
    574 	SUBQ  $16, inl
    575 	MOVOU (inp), T0
    576 	PXOR  T0, A0
    577 	MOVOU A0, (oup)
    578 	LEAQ  16(inp), inp
    579 	LEAQ  16(oup), oup
    580 	MOVO  B0, A0
    581 	MOVO  C0, B0
    582 	MOVO  D0, C0
    583 	JMP   openSSETail64DecLoop
    584 
    585 openSSETail64DecLoopDone:
    586 	MOVO A0, A1
    587 	JMP  openSSETail16
    588 
    589 // ----------------------------------------------------------------------------
    590 // Special optimization for the last 128 bytes of ciphertext
    591 openSSETail128:
    592 	// Need to decrypt up to 128 bytes - prepare two blocks
    593 	MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store
    594 	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store
    595 	XORQ itr2, itr2
    596 	MOVQ inl, itr1
    597 	ANDQ $-16, itr1
    598 
    599 openSSETail128LoopA:
    600 	// Perform ChaCha rounds, while hashing the remaining input
    601 	polyAdd(0(inp)(itr2*1))
    602 	polyMul
    603 
    604 openSSETail128LoopB:
    605 	ADDQ          $16, itr2
    606 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
    607 	shiftB0Left;  shiftC0Left; shiftD0Left
    608 	shiftB1Left;  shiftC1Left; shiftD1Left
    609 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
    610 	shiftB0Right; shiftC0Right; shiftD0Right
    611 	shiftB1Right; shiftC1Right; shiftD1Right
    612 
    613 	CMPQ itr2, itr1
    614 	JB   openSSETail128LoopA
    615 
    616 	CMPQ itr2, $160
    617 	JNE  openSSETail128LoopB
    618 
    619 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
    620 	PADDL state1Store, B0; PADDL state1Store, B1
    621 	PADDL state2Store, C0; PADDL state2Store, C1
    622 	PADDL ctr1Store, D0; PADDL ctr0Store, D1
    623 
    624 	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
    625 	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
    626 	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
    627 
    628 	SUBQ $64, inl
    629 	LEAQ 64(inp), inp
    630 	LEAQ 64(oup), oup
    631 	JMP  openSSETail64DecLoop
    632 
    633 // ----------------------------------------------------------------------------
    634 // Special optimization for the last 192 bytes of ciphertext
    635 openSSETail192:
    636 	// Need to decrypt up to 192 bytes - prepare three blocks
    637 	MOVO chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store
    638 	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
    639 	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store
    640 
    641 	MOVQ    inl, itr1
    642 	MOVQ    $160, itr2
    643 	CMPQ    itr1, $160
    644 	CMOVQGT itr2, itr1
    645 	ANDQ    $-16, itr1
    646 	XORQ    itr2, itr2
    647 
    648 openSSLTail192LoopA:
    649 	// Perform ChaCha rounds, while hashing the remaining input
    650 	polyAdd(0(inp)(itr2*1))
    651 	polyMul
    652 
    653 openSSLTail192LoopB:
    654 	ADDQ         $16, itr2
    655 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
    656 	shiftB0Left; shiftC0Left; shiftD0Left
    657 	shiftB1Left; shiftC1Left; shiftD1Left
    658 	shiftB2Left; shiftC2Left; shiftD2Left
    659 
    660 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
    661 	shiftB0Right; shiftC0Right; shiftD0Right
    662 	shiftB1Right; shiftC1Right; shiftD1Right
    663 	shiftB2Right; shiftC2Right; shiftD2Right
    664 
    665 	CMPQ itr2, itr1
    666 	JB   openSSLTail192LoopA
    667 
    668 	CMPQ itr2, $160
    669 	JNE  openSSLTail192LoopB
    670 
    671 	CMPQ inl, $176
    672 	JB   openSSLTail192Store
    673 
    674 	polyAdd(160(inp))
    675 	polyMul
    676 
    677 	CMPQ inl, $192
    678 	JB   openSSLTail192Store
    679 
    680 	polyAdd(176(inp))
    681 	polyMul
    682 
    683 openSSLTail192Store:
    684 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
    685 	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
    686 	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
    687 	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
    688 
    689 	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
    690 	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
    691 	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
    692 
    693 	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
    694 	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
    695 	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
    696 
    697 	SUBQ $128, inl
    698 	LEAQ 128(inp), inp
    699 	LEAQ 128(oup), oup
    700 	JMP  openSSETail64DecLoop
    701 
    702 // ----------------------------------------------------------------------------
    703 // Special optimization for the last 256 bytes of ciphertext
    704 openSSETail256:
    705 	// Need to decrypt up to 256 bytes - prepare four blocks
    706 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
    707 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
    708 	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
    709 	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
    710 
    711 	// Store counters
    712 	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
    713 	XORQ itr2, itr2
    714 
    715 openSSETail256Loop:
    716 	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
    717 	polyAdd(0(inp)(itr2*1))
    718 	MOVO          C3, tmpStore
    719 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
    720 	MOVO          tmpStore, C3
    721 	MOVO          C1, tmpStore
    722 	chachaQR(A3, B3, C3, D3, C1)
    723 	MOVO          tmpStore, C1
    724 	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
    725 	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
    726 	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
    727 	polyMulStage1
    728 	polyMulStage2
    729 	MOVO          C3, tmpStore
    730 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
    731 	MOVO          tmpStore, C3
    732 	MOVO          C1, tmpStore
    733 	chachaQR(A3, B3, C3, D3, C1)
    734 	MOVO          tmpStore, C1
    735 	polyMulStage3
    736 	polyMulReduceStage
    737 	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
    738 	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
    739 	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
    740 	ADDQ          $2*8, itr2
    741 	CMPQ          itr2, $160
    742 	JB            openSSETail256Loop
    743 	MOVQ          inl, itr1
    744 	ANDQ          $-16, itr1
    745 
    746 openSSETail256HashLoop:
    747 	polyAdd(0(inp)(itr2*1))
    748 	polyMul
    749 	ADDQ $2*8, itr2
    750 	CMPQ itr2, itr1
    751 	JB   openSSETail256HashLoop
    752 
    753 	// Add in the state
    754 	PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
    755 	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
    756 	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
    757 	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
    758 	MOVO  D3, tmpStore
    759 
    760 	// Load - xor - store
    761 	MOVOU (0*16)(inp), D3; PXOR D3, A0
    762 	MOVOU (1*16)(inp), D3; PXOR D3, B0
    763 	MOVOU (2*16)(inp), D3; PXOR D3, C0
    764 	MOVOU (3*16)(inp), D3; PXOR D3, D0
    765 	MOVOU A0, (0*16)(oup)
    766 	MOVOU B0, (1*16)(oup)
    767 	MOVOU C0, (2*16)(oup)
    768 	MOVOU D0, (3*16)(oup)
    769 	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
    770 	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
    771 	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
    772 	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
    773 	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
    774 	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
    775 	LEAQ  192(inp), inp
    776 	LEAQ  192(oup), oup
    777 	SUBQ  $192, inl
    778 	MOVO  A3, A0
    779 	MOVO  B3, B0
    780 	MOVO  C3, C0
    781 	MOVO  tmpStore, D0
    782 
    783 	JMP openSSETail64DecLoop
    784 
    785 // ----------------------------------------------------------------------------
    786 // ------------------------- AVX2 Code ----------------------------------------
    787 chacha20Poly1305Open_AVX2:
    788 	VZEROUPPER
    789 	VMOVDQU chacha20Constants<>(SB), AA0
    790 	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
    791 	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
    792 	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
    793 	VPADDD  avx2InitMask<>(SB), DD0, DD0
    794 
    795 	// Special optimization, for very short buffers
    796 	CMPQ inl, $192
    797 	JBE  openAVX2192
    798 	CMPQ inl, $320
    799 	JBE  openAVX2320
    800 
    801 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
    802 	VMOVDQA BB0, state1StoreAVX2
    803 	VMOVDQA CC0, state2StoreAVX2
    804 	VMOVDQA DD0, ctr3StoreAVX2
    805 	MOVQ    $10, itr2
    806 
    807 openAVX2PreparePolyKey:
    808 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
    809 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
    810 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
    811 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
    812 	DECQ     itr2
    813 	JNE      openAVX2PreparePolyKey
    814 
    815 	VPADDD chacha20Constants<>(SB), AA0, AA0
    816 	VPADDD state1StoreAVX2, BB0, BB0
    817 	VPADDD state2StoreAVX2, CC0, CC0
    818 	VPADDD ctr3StoreAVX2, DD0, DD0
    819 
    820 	VPERM2I128 $0x02, AA0, BB0, TT0
    821 
    822 	// Clamp and store poly key
    823 	VPAND   polyClampMask<>(SB), TT0, TT0
    824 	VMOVDQA TT0, rsStoreAVX2
    825 
    826 	// Stream for the first 64 bytes
    827 	VPERM2I128 $0x13, AA0, BB0, AA0
    828 	VPERM2I128 $0x13, CC0, DD0, BB0
    829 
    830 	// Hash AD + first 64 bytes
    831 	MOVQ ad_len+80(FP), itr2
    832 	CALL polyHashADInternal<>(SB)
    833 	XORQ itr1, itr1
    834 
    835 openAVX2InitialHash64:
    836 	polyAdd(0(inp)(itr1*1))
    837 	polyMulAVX2
    838 	ADDQ $16, itr1
    839 	CMPQ itr1, $64
    840 	JNE  openAVX2InitialHash64
    841 
    842 	// Decrypt the first 64 bytes
    843 	VPXOR   (0*32)(inp), AA0, AA0
    844 	VPXOR   (1*32)(inp), BB0, BB0
    845 	VMOVDQU AA0, (0*32)(oup)
    846 	VMOVDQU BB0, (1*32)(oup)
    847 	LEAQ    (2*32)(inp), inp
    848 	LEAQ    (2*32)(oup), oup
    849 	SUBQ    $64, inl
    850 
    851 openAVX2MainLoop:
    852 	CMPQ inl, $512
    853 	JB   openAVX2MainLoopDone
    854 
    855 	// Load state, increment counter blocks, store the incremented counters
    856 	VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
    857 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
    858 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
    859 	VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
    860 	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
    861 	XORQ    itr1, itr1
    862 
    863 openAVX2InternalLoop:
    864 	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
    865 	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
    866 	polyAdd(0*8(inp)(itr1*1))
    867 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
    868 	polyMulStage1_AVX2
    869 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
    870 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
    871 	polyMulStage2_AVX2
    872 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
    873 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
    874 	polyMulStage3_AVX2
    875 	VMOVDQA  CC3, tmpStoreAVX2
    876 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
    877 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
    878 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
    879 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
    880 	VMOVDQA  tmpStoreAVX2, CC3
    881 	polyMulReduceStage
    882 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
    883 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
    884 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
    885 	polyAdd(2*8(inp)(itr1*1))
    886 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
    887 	polyMulStage1_AVX2
    888 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
    889 	VMOVDQA  CC3, tmpStoreAVX2
    890 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
    891 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
    892 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
    893 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
    894 	VMOVDQA  tmpStoreAVX2, CC3
    895 	polyMulStage2_AVX2
    896 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
    897 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
    898 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
    899 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
    900 	polyMulStage3_AVX2
    901 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
    902 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
    903 	polyMulReduceStage
    904 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
    905 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
    906 	polyAdd(4*8(inp)(itr1*1))
    907 	LEAQ     (6*8)(itr1), itr1
    908 	VMOVDQA  CC3, tmpStoreAVX2
    909 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
    910 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
    911 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
    912 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
    913 	VMOVDQA  tmpStoreAVX2, CC3
    914 	polyMulStage1_AVX2
    915 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
    916 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
    917 	polyMulStage2_AVX2
    918 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
    919 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
    920 	polyMulStage3_AVX2
    921 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
    922 	VMOVDQA  CC3, tmpStoreAVX2
    923 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
    924 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
    925 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
    926 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
    927 	VMOVDQA  tmpStoreAVX2, CC3
    928 	polyMulReduceStage
    929 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
    930 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
    931 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
    932 	CMPQ     itr1, $480
    933 	JNE      openAVX2InternalLoop
    934 
    935 	VPADDD  chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
    936 	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
    937 	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
    938 	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
    939 	VMOVDQA CC3, tmpStoreAVX2
    940 
    941 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
    942 	polyAdd(480(inp))
    943 	polyMulAVX2
    944 	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
    945 	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
    946 	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
    947 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
    948 	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
    949 	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
    950 
    951 	// and here
    952 	polyAdd(496(inp))
    953 	polyMulAVX2
    954 	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
    955 	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
    956 	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
    957 	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
    958 	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
    959 	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
    960 	LEAQ       (32*16)(inp), inp
    961 	LEAQ       (32*16)(oup), oup
    962 	SUBQ       $(32*16), inl
    963 	JMP        openAVX2MainLoop
    964 
    965 openAVX2MainLoopDone:
    966 	// Handle the various tail sizes efficiently
    967 	TESTQ inl, inl
    968 	JE    openSSEFinalize
    969 	CMPQ  inl, $128
    970 	JBE   openAVX2Tail128
    971 	CMPQ  inl, $256
    972 	JBE   openAVX2Tail256
    973 	CMPQ  inl, $384
    974 	JBE   openAVX2Tail384
    975 	JMP   openAVX2Tail512
    976 
    977 // ----------------------------------------------------------------------------
    978 // Special optimization for buffers smaller than 193 bytes
    979 openAVX2192:
    980 	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
    981 	VMOVDQA AA0, AA1
    982 	VMOVDQA BB0, BB1
    983 	VMOVDQA CC0, CC1
    984 	VPADDD  avx2IncMask<>(SB), DD0, DD1
    985 	VMOVDQA AA0, AA2
    986 	VMOVDQA BB0, BB2
    987 	VMOVDQA CC0, CC2
    988 	VMOVDQA DD0, DD2
    989 	VMOVDQA DD1, TT3
    990 	MOVQ    $10, itr2
    991 
    992 openAVX2192InnerCipherLoop:
    993 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
    994 	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
    995 	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
    996 	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
    997 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
    998 	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
    999 	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   1000 	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   1001 	DECQ       itr2
   1002 	JNE        openAVX2192InnerCipherLoop
   1003 	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
   1004 	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
   1005 	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
   1006 	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
   1007 	VPERM2I128 $0x02, AA0, BB0, TT0
   1008 
   1009 	// Clamp and store poly key
   1010 	VPAND   polyClampMask<>(SB), TT0, TT0
   1011 	VMOVDQA TT0, rsStoreAVX2
   1012 
   1013 	// Stream for up to 192 bytes
   1014 	VPERM2I128 $0x13, AA0, BB0, AA0
   1015 	VPERM2I128 $0x13, CC0, DD0, BB0
   1016 	VPERM2I128 $0x02, AA1, BB1, CC0
   1017 	VPERM2I128 $0x02, CC1, DD1, DD0
   1018 	VPERM2I128 $0x13, AA1, BB1, AA1
   1019 	VPERM2I128 $0x13, CC1, DD1, BB1
   1020 
   1021 openAVX2ShortOpen:
   1022 	// Hash
   1023 	MOVQ ad_len+80(FP), itr2
   1024 	CALL polyHashADInternal<>(SB)
   1025 
   1026 openAVX2ShortOpenLoop:
   1027 	CMPQ inl, $32
   1028 	JB   openAVX2ShortTail32
   1029 	SUBQ $32, inl
   1030 
   1031 	// Load for hashing
   1032 	polyAdd(0*8(inp))
   1033 	polyMulAVX2
   1034 	polyAdd(2*8(inp))
   1035 	polyMulAVX2
   1036 
   1037 	// Load for decryption
   1038 	VPXOR   (inp), AA0, AA0
   1039 	VMOVDQU AA0, (oup)
   1040 	LEAQ    (1*32)(inp), inp
   1041 	LEAQ    (1*32)(oup), oup
   1042 
   1043 	// Shift stream left
   1044 	VMOVDQA BB0, AA0
   1045 	VMOVDQA CC0, BB0
   1046 	VMOVDQA DD0, CC0
   1047 	VMOVDQA AA1, DD0
   1048 	VMOVDQA BB1, AA1
   1049 	VMOVDQA CC1, BB1
   1050 	VMOVDQA DD1, CC1
   1051 	VMOVDQA AA2, DD1
   1052 	VMOVDQA BB2, AA2
   1053 	JMP     openAVX2ShortOpenLoop
   1054 
   1055 openAVX2ShortTail32:
   1056 	CMPQ    inl, $16
   1057 	VMOVDQA A0, A1
   1058 	JB      openAVX2ShortDone
   1059 
   1060 	SUBQ $16, inl
   1061 
   1062 	// Load for hashing
   1063 	polyAdd(0*8(inp))
   1064 	polyMulAVX2
   1065 
   1066 	// Load for decryption
   1067 	VPXOR      (inp), A0, T0
   1068 	VMOVDQU    T0, (oup)
   1069 	LEAQ       (1*16)(inp), inp
   1070 	LEAQ       (1*16)(oup), oup
   1071 	VPERM2I128 $0x11, AA0, AA0, AA0
   1072 	VMOVDQA    A0, A1
   1073 
   1074 openAVX2ShortDone:
   1075 	VZEROUPPER
   1076 	JMP openSSETail16
   1077 
   1078 // ----------------------------------------------------------------------------
   1079 // Special optimization for buffers smaller than 321 bytes
   1080 openAVX2320:
   1081 	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
   1082 	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
   1083 	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
   1084 	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
   1085 	MOVQ    $10, itr2
   1086 
   1087 openAVX2320InnerCipherLoop:
   1088 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   1089 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
   1090 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   1091 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
   1092 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   1093 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
   1094 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   1095 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
   1096 	DECQ     itr2
   1097 	JNE      openAVX2320InnerCipherLoop
   1098 
   1099 	VMOVDQA chacha20Constants<>(SB), TT0
   1100 	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
   1101 	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
   1102 	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
   1103 	VMOVDQA avx2IncMask<>(SB), TT0
   1104 	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
   1105 	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
   1106 	VPADDD  TT3, DD2, DD2
   1107 
   1108 	// Clamp and store poly key
   1109 	VPERM2I128 $0x02, AA0, BB0, TT0
   1110 	VPAND      polyClampMask<>(SB), TT0, TT0
   1111 	VMOVDQA    TT0, rsStoreAVX2
   1112 
   1113 	// Stream for up to 320 bytes
   1114 	VPERM2I128 $0x13, AA0, BB0, AA0
   1115 	VPERM2I128 $0x13, CC0, DD0, BB0
   1116 	VPERM2I128 $0x02, AA1, BB1, CC0
   1117 	VPERM2I128 $0x02, CC1, DD1, DD0
   1118 	VPERM2I128 $0x13, AA1, BB1, AA1
   1119 	VPERM2I128 $0x13, CC1, DD1, BB1
   1120 	VPERM2I128 $0x02, AA2, BB2, CC1
   1121 	VPERM2I128 $0x02, CC2, DD2, DD1
   1122 	VPERM2I128 $0x13, AA2, BB2, AA2
   1123 	VPERM2I128 $0x13, CC2, DD2, BB2
   1124 	JMP        openAVX2ShortOpen
   1125 
   1126 // ----------------------------------------------------------------------------
   1127 // Special optimization for the last 128 bytes of ciphertext
   1128 openAVX2Tail128:
   1129 	// Need to decrypt up to 128 bytes - prepare two blocks
   1130 	VMOVDQA chacha20Constants<>(SB), AA1
   1131 	VMOVDQA state1StoreAVX2, BB1
   1132 	VMOVDQA state2StoreAVX2, CC1
   1133 	VMOVDQA ctr3StoreAVX2, DD1
   1134 	VPADDD  avx2IncMask<>(SB), DD1, DD1
   1135 	VMOVDQA DD1, DD0
   1136 
   1137 	XORQ  itr2, itr2
   1138 	MOVQ  inl, itr1
   1139 	ANDQ  $-16, itr1
   1140 	TESTQ itr1, itr1
   1141 	JE    openAVX2Tail128LoopB
   1142 
   1143 openAVX2Tail128LoopA:
   1144 	// Perform ChaCha rounds, while hashing the remaining input
   1145 	polyAdd(0(inp)(itr2*1))
   1146 	polyMulAVX2
   1147 
   1148 openAVX2Tail128LoopB:
   1149 	ADDQ     $16, itr2
   1150 	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   1151 	VPALIGNR $4, BB1, BB1, BB1
   1152 	VPALIGNR $8, CC1, CC1, CC1
   1153 	VPALIGNR $12, DD1, DD1, DD1
   1154 	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   1155 	VPALIGNR $12, BB1, BB1, BB1
   1156 	VPALIGNR $8, CC1, CC1, CC1
   1157 	VPALIGNR $4, DD1, DD1, DD1
   1158 	CMPQ     itr2, itr1
   1159 	JB       openAVX2Tail128LoopA
   1160 	CMPQ     itr2, $160
   1161 	JNE      openAVX2Tail128LoopB
   1162 
   1163 	VPADDD     chacha20Constants<>(SB), AA1, AA1
   1164 	VPADDD     state1StoreAVX2, BB1, BB1
   1165 	VPADDD     state2StoreAVX2, CC1, CC1
   1166 	VPADDD     DD0, DD1, DD1
   1167 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   1168 
   1169 openAVX2TailLoop:
   1170 	CMPQ inl, $32
   1171 	JB   openAVX2Tail
   1172 	SUBQ $32, inl
   1173 
   1174 	// Load for decryption
   1175 	VPXOR   (inp), AA0, AA0
   1176 	VMOVDQU AA0, (oup)
   1177 	LEAQ    (1*32)(inp), inp
   1178 	LEAQ    (1*32)(oup), oup
   1179 	VMOVDQA BB0, AA0
   1180 	VMOVDQA CC0, BB0
   1181 	VMOVDQA DD0, CC0
   1182 	JMP     openAVX2TailLoop
   1183 
   1184 openAVX2Tail:
   1185 	CMPQ    inl, $16
   1186 	VMOVDQA A0, A1
   1187 	JB      openAVX2TailDone
   1188 	SUBQ    $16, inl
   1189 
   1190 	// Load for decryption
   1191 	VPXOR      (inp), A0, T0
   1192 	VMOVDQU    T0, (oup)
   1193 	LEAQ       (1*16)(inp), inp
   1194 	LEAQ       (1*16)(oup), oup
   1195 	VPERM2I128 $0x11, AA0, AA0, AA0
   1196 	VMOVDQA    A0, A1
   1197 
   1198 openAVX2TailDone:
   1199 	VZEROUPPER
   1200 	JMP openSSETail16
   1201 
   1202 // ----------------------------------------------------------------------------
   1203 // Special optimization for the last 256 bytes of ciphertext
   1204 openAVX2Tail256:
   1205 	// Need to decrypt up to 256 bytes - prepare four blocks
   1206 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
   1207 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
   1208 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
   1209 	VMOVDQA ctr3StoreAVX2, DD0
   1210 	VPADDD  avx2IncMask<>(SB), DD0, DD0
   1211 	VPADDD  avx2IncMask<>(SB), DD0, DD1
   1212 	VMOVDQA DD0, TT1
   1213 	VMOVDQA DD1, TT2
   1214 
   1215 	// Compute the number of iterations that will hash data
   1216 	MOVQ    inl, tmpStoreAVX2
   1217 	MOVQ    inl, itr1
   1218 	SUBQ    $128, itr1
   1219 	SHRQ    $4, itr1
   1220 	MOVQ    $10, itr2
   1221 	CMPQ    itr1, $10
   1222 	CMOVQGT itr2, itr1
   1223 	MOVQ    inp, inl
   1224 	XORQ    itr2, itr2
   1225 
   1226 openAVX2Tail256LoopA:
   1227 	polyAdd(0(inl))
   1228 	polyMulAVX2
   1229 	LEAQ 16(inl), inl
   1230 
   1231 	// Perform ChaCha rounds, while hashing the remaining input
   1232 openAVX2Tail256LoopB:
   1233 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   1234 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   1235 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   1236 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   1237 	INCQ     itr2
   1238 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   1239 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   1240 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   1241 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   1242 	CMPQ     itr2, itr1
   1243 	JB       openAVX2Tail256LoopA
   1244 
   1245 	CMPQ itr2, $10
   1246 	JNE  openAVX2Tail256LoopB
   1247 
   1248 	MOVQ inl, itr2
   1249 	SUBQ inp, inl
   1250 	MOVQ inl, itr1
   1251 	MOVQ tmpStoreAVX2, inl
   1252 
   1253 	// Hash the remainder of data (if any)
   1254 openAVX2Tail256Hash:
   1255 	ADDQ $16, itr1
   1256 	CMPQ itr1, inl
   1257 	JGT  openAVX2Tail256HashEnd
   1258 	polyAdd (0(itr2))
   1259 	polyMulAVX2
   1260 	LEAQ 16(itr2), itr2
   1261 	JMP  openAVX2Tail256Hash
   1262 
   1263 // Store 128 bytes safely, then go to store loop
   1264 openAVX2Tail256HashEnd:
   1265 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
   1266 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
   1267 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
   1268 	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
   1269 	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
   1270 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   1271 
   1272 	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
   1273 	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
   1274 	LEAQ    (4*32)(inp), inp
   1275 	LEAQ    (4*32)(oup), oup
   1276 	SUBQ    $4*32, inl
   1277 
   1278 	JMP openAVX2TailLoop
   1279 
   1280 // ----------------------------------------------------------------------------
   1281 // Special optimization for the last 384 bytes of ciphertext
   1282 openAVX2Tail384:
   1283 	// Need to decrypt up to 384 bytes - prepare six blocks
   1284 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
   1285 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
   1286 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
   1287 	VMOVDQA ctr3StoreAVX2, DD0
   1288 	VPADDD  avx2IncMask<>(SB), DD0, DD0
   1289 	VPADDD  avx2IncMask<>(SB), DD0, DD1
   1290 	VPADDD  avx2IncMask<>(SB), DD1, DD2
   1291 	VMOVDQA DD0, ctr0StoreAVX2
   1292 	VMOVDQA DD1, ctr1StoreAVX2
   1293 	VMOVDQA DD2, ctr2StoreAVX2
   1294 
   1295 	// Compute the number of iterations that will hash two blocks of data
   1296 	MOVQ    inl, tmpStoreAVX2
   1297 	MOVQ    inl, itr1
   1298 	SUBQ    $256, itr1
   1299 	SHRQ    $4, itr1
   1300 	ADDQ    $6, itr1
   1301 	MOVQ    $10, itr2
   1302 	CMPQ    itr1, $10
   1303 	CMOVQGT itr2, itr1
   1304 	MOVQ    inp, inl
   1305 	XORQ    itr2, itr2
   1306 
   1307 	// Perform ChaCha rounds, while hashing the remaining input
   1308 openAVX2Tail384LoopB:
   1309 	polyAdd(0(inl))
   1310 	polyMulAVX2
   1311 	LEAQ 16(inl), inl
   1312 
   1313 openAVX2Tail384LoopA:
   1314 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   1315 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
   1316 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   1317 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
   1318 	polyAdd(0(inl))
   1319 	polyMulAVX2
   1320 	LEAQ     16(inl), inl
   1321 	INCQ     itr2
   1322 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   1323 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
   1324 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   1325 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
   1326 
   1327 	CMPQ itr2, itr1
   1328 	JB   openAVX2Tail384LoopB
   1329 
   1330 	CMPQ itr2, $10
   1331 	JNE  openAVX2Tail384LoopA
   1332 
   1333 	MOVQ inl, itr2
   1334 	SUBQ inp, inl
   1335 	MOVQ inl, itr1
   1336 	MOVQ tmpStoreAVX2, inl
   1337 
   1338 openAVX2Tail384Hash:
   1339 	ADDQ $16, itr1
   1340 	CMPQ itr1, inl
   1341 	JGT  openAVX2Tail384HashEnd
   1342 	polyAdd(0(itr2))
   1343 	polyMulAVX2
   1344 	LEAQ 16(itr2), itr2
   1345 	JMP  openAVX2Tail384Hash
   1346 
   1347 // Store 256 bytes safely, then go to store loop
   1348 openAVX2Tail384HashEnd:
   1349 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
   1350 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
   1351 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
   1352 	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
   1353 	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
   1354 	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
   1355 	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
   1356 	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
   1357 	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
   1358 	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
   1359 	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   1360 	LEAQ       (8*32)(inp), inp
   1361 	LEAQ       (8*32)(oup), oup
   1362 	SUBQ       $8*32, inl
   1363 	JMP        openAVX2TailLoop
   1364 
   1365 // ----------------------------------------------------------------------------
   1366 // Special optimization for the last 512 bytes of ciphertext
   1367 openAVX2Tail512:
   1368 	VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   1369 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   1370 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   1371 	VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
   1372 	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   1373 	XORQ    itr1, itr1
   1374 	MOVQ    inp, itr2
   1375 
   1376 openAVX2Tail512LoopB:
   1377 	polyAdd(0(itr2))
   1378 	polyMulAVX2
   1379 	LEAQ (2*8)(itr2), itr2
   1380 
   1381 openAVX2Tail512LoopA:
   1382 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   1383 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   1384 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   1385 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   1386 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   1387 	VMOVDQA  CC3, tmpStoreAVX2
   1388 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   1389 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   1390 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   1391 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   1392 	VMOVDQA  tmpStoreAVX2, CC3
   1393 	polyAdd(0*8(itr2))
   1394 	polyMulAVX2
   1395 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   1396 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   1397 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   1398 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   1399 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   1400 	VMOVDQA  CC3, tmpStoreAVX2
   1401 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   1402 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   1403 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   1404 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   1405 	VMOVDQA  tmpStoreAVX2, CC3
   1406 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   1407 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   1408 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   1409 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   1410 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   1411 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   1412 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   1413 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   1414 	polyAdd(2*8(itr2))
   1415 	polyMulAVX2
   1416 	LEAQ     (4*8)(itr2), itr2
   1417 	VMOVDQA  CC3, tmpStoreAVX2
   1418 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   1419 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   1420 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   1421 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   1422 	VMOVDQA  tmpStoreAVX2, CC3
   1423 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   1424 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   1425 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   1426 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   1427 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   1428 	VMOVDQA  CC3, tmpStoreAVX2
   1429 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   1430 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   1431 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   1432 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   1433 	VMOVDQA  tmpStoreAVX2, CC3
   1434 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   1435 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   1436 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   1437 	INCQ     itr1
   1438 	CMPQ     itr1, $4
   1439 	JLT      openAVX2Tail512LoopB
   1440 
   1441 	CMPQ itr1, $10
   1442 	JNE  openAVX2Tail512LoopA
   1443 
   1444 	MOVQ inl, itr1
   1445 	SUBQ $384, itr1
   1446 	ANDQ $-16, itr1
   1447 
   1448 openAVX2Tail512HashLoop:
   1449 	TESTQ itr1, itr1
   1450 	JE    openAVX2Tail512HashEnd
   1451 	polyAdd(0(itr2))
   1452 	polyMulAVX2
   1453 	LEAQ  16(itr2), itr2
   1454 	SUBQ  $16, itr1
   1455 	JMP   openAVX2Tail512HashLoop
   1456 
   1457 openAVX2Tail512HashEnd:
   1458 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
   1459 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   1460 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   1461 	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   1462 	VMOVDQA    CC3, tmpStoreAVX2
   1463 	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   1464 	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   1465 	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   1466 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   1467 	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   1468 	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   1469 	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   1470 	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   1471 	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   1472 	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   1473 
   1474 	LEAQ (12*32)(inp), inp
   1475 	LEAQ (12*32)(oup), oup
   1476 	SUBQ $12*32, inl
   1477 
   1478 	JMP openAVX2TailLoop
   1479 
   1480 // ----------------------------------------------------------------------------
   1481 // ----------------------------------------------------------------------------
   1482 // func chacha20Poly1305Seal(dst, key, src, ad []byte)
   1483 TEXT chacha20Poly1305Seal(SB), 0, $288-96
   1484 	// For aligned stack access
   1485 	MOVQ SP, BP
   1486 	ADDQ $32, BP
   1487 	ANDQ $-32, BP
   1488 	MOVQ dst+0(FP), oup
   1489 	MOVQ key+24(FP), keyp
   1490 	MOVQ src+48(FP), inp
   1491 	MOVQ src_len+56(FP), inl
   1492 	MOVQ ad+72(FP), adp
   1493 
   1494 	// Check for AVX2 support
   1495 	CMPB runtimesupport_avx2(SB), $0
   1496 	JE   noavx2bmi2Seal
   1497 
   1498 	// Check BMI2 bit for MULXQ.
   1499 	// runtimecpuid_ebx7 is always available here
   1500 	// because it passed avx2 check
   1501 	TESTL $(1<<8), runtimecpuid_ebx7(SB)
   1502 	JNE   chacha20Poly1305Seal_AVX2
   1503 noavx2bmi2Seal:
   1504 
   1505 	// Special optimization, for very short buffers
   1506 	CMPQ inl, $128
   1507 	JBE  sealSSE128 // About 15% faster
   1508 
   1509 	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
   1510 	MOVOU chacha20Constants<>(SB), A0
   1511 	MOVOU (1*16)(keyp), B0
   1512 	MOVOU (2*16)(keyp), C0
   1513 	MOVOU (3*16)(keyp), D0
   1514 
   1515 	// Store state on stack for future use
   1516 	MOVO B0, state1Store
   1517 	MOVO C0, state2Store
   1518 
   1519 	// Load state, increment counter blocks
   1520 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
   1521 	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
   1522 	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
   1523 
   1524 	// Store counters
   1525 	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   1526 	MOVQ $10, itr2
   1527 
   1528 sealSSEIntroLoop:
   1529 	MOVO         C3, tmpStore
   1530 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   1531 	MOVO         tmpStore, C3
   1532 	MOVO         C1, tmpStore
   1533 	chachaQR(A3, B3, C3, D3, C1)
   1534 	MOVO         tmpStore, C1
   1535 	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
   1536 	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
   1537 	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
   1538 
   1539 	MOVO          C3, tmpStore
   1540 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   1541 	MOVO          tmpStore, C3
   1542 	MOVO          C1, tmpStore
   1543 	chachaQR(A3, B3, C3, D3, C1)
   1544 	MOVO          tmpStore, C1
   1545 	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   1546 	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   1547 	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   1548 	DECQ          itr2
   1549 	JNE           sealSSEIntroLoop
   1550 
   1551 	// Add in the state
   1552 	PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
   1553 	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   1554 	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   1555 	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   1556 
   1557 	// Clamp and store the key
   1558 	PAND polyClampMask<>(SB), A0
   1559 	MOVO A0, rStore
   1560 	MOVO B0, sStore
   1561 
   1562 	// Hash AAD
   1563 	MOVQ ad_len+80(FP), itr2
   1564 	CALL polyHashADInternal<>(SB)
   1565 
   1566 	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
   1567 	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   1568 	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   1569 	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   1570 	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   1571 	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
   1572 
   1573 	MOVQ $128, itr1
   1574 	SUBQ $128, inl
   1575 	LEAQ 128(inp), inp
   1576 
   1577 	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
   1578 
   1579 	CMPQ inl, $64
   1580 	JBE  sealSSE128SealHash
   1581 
   1582 	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
   1583 	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
   1584 	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
   1585 
   1586 	ADDQ $64, itr1
   1587 	SUBQ $64, inl
   1588 	LEAQ 64(inp), inp
   1589 
   1590 	MOVQ $2, itr1
   1591 	MOVQ $8, itr2
   1592 
   1593 	CMPQ inl, $64
   1594 	JBE  sealSSETail64
   1595 	CMPQ inl, $128
   1596 	JBE  sealSSETail128
   1597 	CMPQ inl, $192
   1598 	JBE  sealSSETail192
   1599 
   1600 sealSSEMainLoop:
   1601 	// Load state, increment counter blocks
   1602 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
   1603 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
   1604 	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
   1605 	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
   1606 
   1607 	// Store counters
   1608 	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   1609 
   1610 sealSSEInnerLoop:
   1611 	MOVO          C3, tmpStore
   1612 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   1613 	MOVO          tmpStore, C3
   1614 	MOVO          C1, tmpStore
   1615 	chachaQR(A3, B3, C3, D3, C1)
   1616 	MOVO          tmpStore, C1
   1617 	polyAdd(0(oup))
   1618 	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   1619 	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   1620 	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   1621 	polyMulStage1
   1622 	polyMulStage2
   1623 	LEAQ          (2*8)(oup), oup
   1624 	MOVO          C3, tmpStore
   1625 	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   1626 	MOVO          tmpStore, C3
   1627 	MOVO          C1, tmpStore
   1628 	polyMulStage3
   1629 	chachaQR(A3, B3, C3, D3, C1)
   1630 	MOVO          tmpStore, C1
   1631 	polyMulReduceStage
   1632 	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   1633 	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   1634 	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   1635 	DECQ          itr2
   1636 	JGE           sealSSEInnerLoop
   1637 	polyAdd(0(oup))
   1638 	polyMul
   1639 	LEAQ          (2*8)(oup), oup
   1640 	DECQ          itr1
   1641 	JG            sealSSEInnerLoop
   1642 
   1643 	// Add in the state
   1644 	PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
   1645 	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   1646 	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   1647 	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   1648 	MOVO  D3, tmpStore
   1649 
   1650 	// Load - xor - store
   1651 	MOVOU (0*16)(inp), D3; PXOR D3, A0
   1652 	MOVOU (1*16)(inp), D3; PXOR D3, B0
   1653 	MOVOU (2*16)(inp), D3; PXOR D3, C0
   1654 	MOVOU (3*16)(inp), D3; PXOR D3, D0
   1655 	MOVOU A0, (0*16)(oup)
   1656 	MOVOU B0, (1*16)(oup)
   1657 	MOVOU C0, (2*16)(oup)
   1658 	MOVOU D0, (3*16)(oup)
   1659 	MOVO  tmpStore, D3
   1660 
   1661 	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   1662 	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   1663 	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   1664 	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   1665 	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   1666 	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   1667 	ADDQ  $192, inp
   1668 	MOVQ  $192, itr1
   1669 	SUBQ  $192, inl
   1670 	MOVO  A3, A1
   1671 	MOVO  B3, B1
   1672 	MOVO  C3, C1
   1673 	MOVO  D3, D1
   1674 	CMPQ  inl, $64
   1675 	JBE   sealSSE128SealHash
   1676 	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
   1677 	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
   1678 	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
   1679 	LEAQ  64(inp), inp
   1680 	SUBQ  $64, inl
   1681 	MOVQ  $6, itr1
   1682 	MOVQ  $4, itr2
   1683 	CMPQ  inl, $192
   1684 	JG    sealSSEMainLoop
   1685 
   1686 	MOVQ  inl, itr1
   1687 	TESTQ inl, inl
   1688 	JE    sealSSE128SealHash
   1689 	MOVQ  $6, itr1
   1690 	CMPQ  inl, $64
   1691 	JBE   sealSSETail64
   1692 	CMPQ  inl, $128
   1693 	JBE   sealSSETail128
   1694 	JMP   sealSSETail192
   1695 
   1696 // ----------------------------------------------------------------------------
   1697 // Special optimization for the last 64 bytes of plaintext
   1698 sealSSETail64:
   1699 	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
   1700 	MOVO  chacha20Constants<>(SB), A1
   1701 	MOVO  state1Store, B1
   1702 	MOVO  state2Store, C1
   1703 	MOVO  ctr3Store, D1
   1704 	PADDL sseIncMask<>(SB), D1
   1705 	MOVO  D1, ctr0Store
   1706 
   1707 sealSSETail64LoopA:
   1708 	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
   1709 	polyAdd(0(oup))
   1710 	polyMul
   1711 	LEAQ 16(oup), oup
   1712 
   1713 sealSSETail64LoopB:
   1714 	chachaQR(A1, B1, C1, D1, T1)
   1715 	shiftB1Left;  shiftC1Left; shiftD1Left
   1716 	chachaQR(A1, B1, C1, D1, T1)
   1717 	shiftB1Right; shiftC1Right; shiftD1Right
   1718 	polyAdd(0(oup))
   1719 	polyMul
   1720 	LEAQ          16(oup), oup
   1721 
   1722 	DECQ itr1
   1723 	JG   sealSSETail64LoopA
   1724 
   1725 	DECQ  itr2
   1726 	JGE   sealSSETail64LoopB
   1727 	PADDL chacha20Constants<>(SB), A1
   1728 	PADDL state1Store, B1
   1729 	PADDL state2Store, C1
   1730 	PADDL ctr0Store, D1
   1731 
   1732 	JMP sealSSE128Seal
   1733 
   1734 // ----------------------------------------------------------------------------
   1735 // Special optimization for the last 128 bytes of plaintext
   1736 sealSSETail128:
   1737 	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
   1738 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   1739 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   1740 
   1741 sealSSETail128LoopA:
   1742 	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
   1743 	polyAdd(0(oup))
   1744 	polyMul
   1745 	LEAQ 16(oup), oup
   1746 
   1747 sealSSETail128LoopB:
   1748 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   1749 	shiftB0Left;  shiftC0Left; shiftD0Left
   1750 	shiftB1Left;  shiftC1Left; shiftD1Left
   1751 	polyAdd(0(oup))
   1752 	polyMul
   1753 	LEAQ          16(oup), oup
   1754 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   1755 	shiftB0Right; shiftC0Right; shiftD0Right
   1756 	shiftB1Right; shiftC1Right; shiftD1Right
   1757 
   1758 	DECQ itr1
   1759 	JG   sealSSETail128LoopA
   1760 
   1761 	DECQ itr2
   1762 	JGE  sealSSETail128LoopB
   1763 
   1764 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
   1765 	PADDL state1Store, B0; PADDL state1Store, B1
   1766 	PADDL state2Store, C0; PADDL state2Store, C1
   1767 	PADDL ctr0Store, D0; PADDL ctr1Store, D1
   1768 
   1769 	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   1770 	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
   1771 	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
   1772 
   1773 	MOVQ $64, itr1
   1774 	LEAQ 64(inp), inp
   1775 	SUBQ $64, inl
   1776 
   1777 	JMP sealSSE128SealHash
   1778 
   1779 // ----------------------------------------------------------------------------
   1780 // Special optimization for the last 192 bytes of plaintext
   1781 sealSSETail192:
   1782 	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
   1783 	MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   1784 	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   1785 	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store
   1786 
   1787 sealSSETail192LoopA:
   1788 	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
   1789 	polyAdd(0(oup))
   1790 	polyMul
   1791 	LEAQ 16(oup), oup
   1792 
   1793 sealSSETail192LoopB:
   1794 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   1795 	shiftB0Left; shiftC0Left; shiftD0Left
   1796 	shiftB1Left; shiftC1Left; shiftD1Left
   1797 	shiftB2Left; shiftC2Left; shiftD2Left
   1798 
   1799 	polyAdd(0(oup))
   1800 	polyMul
   1801 	LEAQ 16(oup), oup
   1802 
   1803 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   1804 	shiftB0Right; shiftC0Right; shiftD0Right
   1805 	shiftB1Right; shiftC1Right; shiftD1Right
   1806 	shiftB2Right; shiftC2Right; shiftD2Right
   1807 
   1808 	DECQ itr1
   1809 	JG   sealSSETail192LoopA
   1810 
   1811 	DECQ itr2
   1812 	JGE  sealSSETail192LoopB
   1813 
   1814 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
   1815 	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   1816 	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   1817 	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
   1818 
   1819 	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   1820 	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
   1821 	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
   1822 	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   1823 	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   1824 	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   1825 
   1826 	MOVO A2, A1
   1827 	MOVO B2, B1
   1828 	MOVO C2, C1
   1829 	MOVO D2, D1
   1830 	MOVQ $128, itr1
   1831 	LEAQ 128(inp), inp
   1832 	SUBQ $128, inl
   1833 
   1834 	JMP sealSSE128SealHash
   1835 
   1836 // ----------------------------------------------------------------------------
   1837 // Special seal optimization for buffers smaller than 129 bytes
   1838 sealSSE128:
   1839 	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
   1840 	MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   1841 	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
   1842 	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
   1843 	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
   1844 	MOVQ  $10, itr2
   1845 
   1846 sealSSE128InnerCipherLoop:
   1847 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   1848 	shiftB0Left;  shiftB1Left; shiftB2Left
   1849 	shiftC0Left;  shiftC1Left; shiftC2Left
   1850 	shiftD0Left;  shiftD1Left; shiftD2Left
   1851 	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   1852 	shiftB0Right; shiftB1Right; shiftB2Right
   1853 	shiftC0Right; shiftC1Right; shiftC2Right
   1854 	shiftD0Right; shiftD1Right; shiftD2Right
   1855 	DECQ          itr2
   1856 	JNE           sealSSE128InnerCipherLoop
   1857 
   1858 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   1859 	PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
   1860 	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   1861 	PADDL T2, C1; PADDL T2, C2
   1862 	PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
   1863 	PAND  polyClampMask<>(SB), A0
   1864 	MOVOU A0, rStore
   1865 	MOVOU B0, sStore
   1866 
   1867 	// Hash
   1868 	MOVQ ad_len+80(FP), itr2
   1869 	CALL polyHashADInternal<>(SB)
   1870 	XORQ itr1, itr1
   1871 
   1872 sealSSE128SealHash:
   1873 	// itr1 holds the number of bytes encrypted but not yet hashed
   1874 	CMPQ itr1, $16
   1875 	JB   sealSSE128Seal
   1876 	polyAdd(0(oup))
   1877 	polyMul
   1878 
   1879 	SUBQ $16, itr1
   1880 	ADDQ $16, oup
   1881 
   1882 	JMP sealSSE128SealHash
   1883 
   1884 sealSSE128Seal:
   1885 	CMPQ inl, $16
   1886 	JB   sealSSETail
   1887 	SUBQ $16, inl
   1888 
   1889 	// Load for decryption
   1890 	MOVOU (inp), T0
   1891 	PXOR  T0, A1
   1892 	MOVOU A1, (oup)
   1893 	LEAQ  (1*16)(inp), inp
   1894 	LEAQ  (1*16)(oup), oup
   1895 
   1896 	// Extract for hashing
   1897 	MOVQ   A1, t0
   1898 	PSRLDQ $8, A1
   1899 	MOVQ A1, t1
   1900 	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   1901 	polyMul
   1902 
   1903 	// Shift the stream "left"
   1904 	MOVO B1, A1
   1905 	MOVO C1, B1
   1906 	MOVO D1, C1
   1907 	MOVO A2, D1
   1908 	MOVO B2, A2
   1909 	MOVO C2, B2
   1910 	MOVO D2, C2
   1911 	JMP  sealSSE128Seal
   1912 
   1913 sealSSETail:
   1914 	TESTQ inl, inl
   1915 	JE    sealSSEFinalize
   1916 
   1917 	// We can only load the PT one byte at a time to avoid read after end of buffer
   1918 	MOVQ inl, itr2
   1919 	SHLQ $4, itr2
   1920 	LEAQ andMask<>(SB), t0
   1921 	MOVQ inl, itr1
   1922 	LEAQ -1(inp)(inl*1), inp
   1923 	XORQ t2, t2
   1924 	XORQ t3, t3
   1925 	XORQ AX, AX
   1926 
   1927 sealSSETailLoadLoop:
   1928 	SHLQ $8, t2, t3
   1929 	SHLQ $8, t2
   1930 	MOVB (inp), AX
   1931 	XORQ AX, t2
   1932 	LEAQ   -1(inp), inp
   1933 	DECQ   itr1
   1934 	JNE    sealSSETailLoadLoop
   1935 	MOVQ t2, 0+tmpStore
   1936 	MOVQ t3, 8+tmpStore
   1937 	PXOR 0+tmpStore, A1
   1938 	MOVOU  A1, (oup)
   1939 	MOVOU  -16(t0)(itr2*1), T0
   1940 	PAND   T0, A1
   1941 	MOVQ   A1, t0
   1942 	PSRLDQ $8, A1
   1943 	MOVQ   A1, t1
   1944 	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   1945 	polyMul
   1946 
   1947 	ADDQ inl, oup
   1948 
   1949 sealSSEFinalize:
   1950 	// Hash in the buffer lengths
   1951 	ADDQ ad_len+80(FP), acc0
   1952 	ADCQ src_len+56(FP), acc1
   1953 	ADCQ $1, acc2
   1954 	polyMul
   1955 
   1956 	// Final reduce
   1957 	MOVQ    acc0, t0
   1958 	MOVQ    acc1, t1
   1959 	MOVQ    acc2, t2
   1960 	SUBQ    $-5, acc0
   1961 	SBBQ    $-1, acc1
   1962 	SBBQ    $3, acc2
   1963 	CMOVQCS t0, acc0
   1964 	CMOVQCS t1, acc1
   1965 	CMOVQCS t2, acc2
   1966 
   1967 	// Add in the "s" part of the key
   1968 	ADDQ 0+sStore, acc0
   1969 	ADCQ 8+sStore, acc1
   1970 
   1971 	// Finally store the tag at the end of the message
   1972 	MOVQ acc0, (0*8)(oup)
   1973 	MOVQ acc1, (1*8)(oup)
   1974 	RET
   1975 
   1976 // ----------------------------------------------------------------------------
   1977 // ------------------------- AVX2 Code ----------------------------------------
   1978 chacha20Poly1305Seal_AVX2:
   1979 	VZEROUPPER
   1980 	VMOVDQU chacha20Constants<>(SB), AA0
   1981 	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   1982 	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   1983 	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
   1984 	VPADDD  avx2InitMask<>(SB), DD0, DD0
   1985 
   1986 	// Special optimizations, for very short buffers
   1987 	CMPQ inl, $192
   1988 	JBE  seal192AVX2 // 33% faster
   1989 	CMPQ inl, $320
   1990 	JBE  seal320AVX2 // 17% faster
   1991 
   1992 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
   1993 	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   1994 	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
   1995 	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
   1996 	VPADDD  avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
   1997 	VPADDD  avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
   1998 	VPADDD  avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
   1999 	VMOVDQA DD3, ctr3StoreAVX2
   2000 	MOVQ    $10, itr2
   2001 
   2002 sealAVX2IntroLoop:
   2003 	VMOVDQA CC3, tmpStoreAVX2
   2004 	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
   2005 	VMOVDQA tmpStoreAVX2, CC3
   2006 	VMOVDQA CC1, tmpStoreAVX2
   2007 	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
   2008 	VMOVDQA tmpStoreAVX2, CC1
   2009 
   2010 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   2011 	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
   2012 	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
   2013 	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
   2014 
   2015 	VMOVDQA CC3, tmpStoreAVX2
   2016 	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
   2017 	VMOVDQA tmpStoreAVX2, CC3
   2018 	VMOVDQA CC1, tmpStoreAVX2
   2019 	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
   2020 	VMOVDQA tmpStoreAVX2, CC1
   2021 
   2022 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   2023 	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
   2024 	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
   2025 	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
   2026 	DECQ     itr2
   2027 	JNE      sealAVX2IntroLoop
   2028 
   2029 	VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
   2030 	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   2031 	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   2032 	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   2033 
   2034 	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
   2035 	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
   2036 	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
   2037 
   2038 	// Clamp and store poly key
   2039 	VPAND   polyClampMask<>(SB), DD0, DD0
   2040 	VMOVDQA DD0, rsStoreAVX2
   2041 
   2042 	// Hash AD
   2043 	MOVQ ad_len+80(FP), itr2
   2044 	CALL polyHashADInternal<>(SB)
   2045 
   2046 	// Can store at least 320 bytes
   2047 	VPXOR   (0*32)(inp), AA0, AA0
   2048 	VPXOR   (1*32)(inp), CC0, CC0
   2049 	VMOVDQU AA0, (0*32)(oup)
   2050 	VMOVDQU CC0, (1*32)(oup)
   2051 
   2052 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   2053 	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
   2054 	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
   2055 	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   2056 	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
   2057 	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
   2058 
   2059 	MOVQ $320, itr1
   2060 	SUBQ $320, inl
   2061 	LEAQ 320(inp), inp
   2062 
   2063 	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
   2064 	CMPQ       inl, $128
   2065 	JBE        sealAVX2SealHash
   2066 
   2067 	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
   2068 	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
   2069 	SUBQ    $128, inl
   2070 	LEAQ    128(inp), inp
   2071 
   2072 	MOVQ $8, itr1
   2073 	MOVQ $2, itr2
   2074 
   2075 	CMPQ inl, $128
   2076 	JBE  sealAVX2Tail128
   2077 	CMPQ inl, $256
   2078 	JBE  sealAVX2Tail256
   2079 	CMPQ inl, $384
   2080 	JBE  sealAVX2Tail384
   2081 	CMPQ inl, $512
   2082 	JBE  sealAVX2Tail512
   2083 
   2084 	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
   2085 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   2086 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   2087 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   2088 	VMOVDQA ctr3StoreAVX2, DD0
   2089 	VPADDD  avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
   2090 	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   2091 
   2092 	VMOVDQA CC3, tmpStoreAVX2
   2093 	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
   2094 	VMOVDQA tmpStoreAVX2, CC3
   2095 	VMOVDQA CC1, tmpStoreAVX2
   2096 	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
   2097 	VMOVDQA tmpStoreAVX2, CC1
   2098 
   2099 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   2100 	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
   2101 	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
   2102 	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
   2103 
   2104 	VMOVDQA CC3, tmpStoreAVX2
   2105 	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
   2106 	VMOVDQA tmpStoreAVX2, CC3
   2107 	VMOVDQA CC1, tmpStoreAVX2
   2108 	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
   2109 	VMOVDQA tmpStoreAVX2, CC1
   2110 
   2111 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   2112 	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
   2113 	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
   2114 	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
   2115 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2116 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2117 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   2118 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2119 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2120 	VMOVDQA  CC3, tmpStoreAVX2
   2121 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   2122 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   2123 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   2124 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   2125 	VMOVDQA  tmpStoreAVX2, CC3
   2126 
   2127 	SUBQ $16, oup                  // Adjust the pointer
   2128 	MOVQ $9, itr1
   2129 	JMP  sealAVX2InternalLoopStart
   2130 
   2131 sealAVX2MainLoop:
   2132 	// Load state, increment counter blocks, store the incremented counters
   2133 	VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   2134 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   2135 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   2136 	VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
   2137 	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   2138 	MOVQ    $10, itr1
   2139 
   2140 sealAVX2InternalLoop:
   2141 	polyAdd(0*8(oup))
   2142 	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2143 	polyMulStage1_AVX2
   2144 	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2145 	VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   2146 	polyMulStage2_AVX2
   2147 	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2148 	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2149 	polyMulStage3_AVX2
   2150 	VMOVDQA CC3, tmpStoreAVX2
   2151 	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   2152 	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   2153 	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   2154 	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   2155 	VMOVDQA tmpStoreAVX2, CC3
   2156 	polyMulReduceStage
   2157 
   2158 sealAVX2InternalLoopStart:
   2159 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2160 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2161 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   2162 	polyAdd(2*8(oup))
   2163 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2164 	polyMulStage1_AVX2
   2165 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2166 	VMOVDQA  CC3, tmpStoreAVX2
   2167 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   2168 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   2169 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   2170 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   2171 	VMOVDQA  tmpStoreAVX2, CC3
   2172 	polyMulStage2_AVX2
   2173 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   2174 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   2175 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   2176 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2177 	polyMulStage3_AVX2
   2178 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2179 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   2180 	polyMulReduceStage
   2181 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2182 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2183 	polyAdd(4*8(oup))
   2184 	LEAQ     (6*8)(oup), oup
   2185 	VMOVDQA  CC3, tmpStoreAVX2
   2186 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   2187 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   2188 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   2189 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   2190 	VMOVDQA  tmpStoreAVX2, CC3
   2191 	polyMulStage1_AVX2
   2192 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2193 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2194 	polyMulStage2_AVX2
   2195 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   2196 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2197 	polyMulStage3_AVX2
   2198 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2199 	VMOVDQA  CC3, tmpStoreAVX2
   2200 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   2201 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   2202 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   2203 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   2204 	VMOVDQA  tmpStoreAVX2, CC3
   2205 	polyMulReduceStage
   2206 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   2207 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   2208 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   2209 	DECQ     itr1
   2210 	JNE      sealAVX2InternalLoop
   2211 
   2212 	VPADDD  chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
   2213 	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   2214 	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   2215 	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   2216 	VMOVDQA CC3, tmpStoreAVX2
   2217 
   2218 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
   2219 	polyAdd(0*8(oup))
   2220 	polyMulAVX2
   2221 	LEAQ       (4*8)(oup), oup
   2222 	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   2223 	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   2224 	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   2225 	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   2226 	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   2227 	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   2228 
   2229 	// and here
   2230 	polyAdd(-2*8(oup))
   2231 	polyMulAVX2
   2232 	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   2233 	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   2234 	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   2235 	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   2236 	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
   2237 	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
   2238 	LEAQ       (32*16)(inp), inp
   2239 	SUBQ       $(32*16), inl
   2240 	CMPQ       inl, $512
   2241 	JG         sealAVX2MainLoop
   2242 
   2243 	// Tail can only hash 480 bytes
   2244 	polyAdd(0*8(oup))
   2245 	polyMulAVX2
   2246 	polyAdd(2*8(oup))
   2247 	polyMulAVX2
   2248 	LEAQ 32(oup), oup
   2249 
   2250 	MOVQ $10, itr1
   2251 	MOVQ $0, itr2
   2252 	CMPQ inl, $128
   2253 	JBE  sealAVX2Tail128
   2254 	CMPQ inl, $256
   2255 	JBE  sealAVX2Tail256
   2256 	CMPQ inl, $384
   2257 	JBE  sealAVX2Tail384
   2258 	JMP  sealAVX2Tail512
   2259 
   2260 // ----------------------------------------------------------------------------
   2261 // Special optimization for buffers smaller than 193 bytes
   2262 seal192AVX2:
   2263 	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
   2264 	VMOVDQA AA0, AA1
   2265 	VMOVDQA BB0, BB1
   2266 	VMOVDQA CC0, CC1
   2267 	VPADDD  avx2IncMask<>(SB), DD0, DD1
   2268 	VMOVDQA AA0, AA2
   2269 	VMOVDQA BB0, BB2
   2270 	VMOVDQA CC0, CC2
   2271 	VMOVDQA DD0, DD2
   2272 	VMOVDQA DD1, TT3
   2273 	MOVQ    $10, itr2
   2274 
   2275 sealAVX2192InnerCipherLoop:
   2276 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   2277 	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   2278 	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   2279 	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   2280 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   2281 	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   2282 	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   2283 	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   2284 	DECQ       itr2
   2285 	JNE        sealAVX2192InnerCipherLoop
   2286 	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
   2287 	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
   2288 	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
   2289 	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
   2290 	VPERM2I128 $0x02, AA0, BB0, TT0
   2291 
   2292 	// Clamp and store poly key
   2293 	VPAND   polyClampMask<>(SB), TT0, TT0
   2294 	VMOVDQA TT0, rsStoreAVX2
   2295 
   2296 	// Stream for up to 192 bytes
   2297 	VPERM2I128 $0x13, AA0, BB0, AA0
   2298 	VPERM2I128 $0x13, CC0, DD0, BB0
   2299 	VPERM2I128 $0x02, AA1, BB1, CC0
   2300 	VPERM2I128 $0x02, CC1, DD1, DD0
   2301 	VPERM2I128 $0x13, AA1, BB1, AA1
   2302 	VPERM2I128 $0x13, CC1, DD1, BB1
   2303 
   2304 sealAVX2ShortSeal:
   2305 	// Hash aad
   2306 	MOVQ ad_len+80(FP), itr2
   2307 	CALL polyHashADInternal<>(SB)
   2308 	XORQ itr1, itr1
   2309 
   2310 sealAVX2SealHash:
   2311 	// itr1 holds the number of bytes encrypted but not yet hashed
   2312 	CMPQ itr1, $16
   2313 	JB   sealAVX2ShortSealLoop
   2314 	polyAdd(0(oup))
   2315 	polyMul
   2316 	SUBQ $16, itr1
   2317 	ADDQ $16, oup
   2318 	JMP  sealAVX2SealHash
   2319 
   2320 sealAVX2ShortSealLoop:
   2321 	CMPQ inl, $32
   2322 	JB   sealAVX2ShortTail32
   2323 	SUBQ $32, inl
   2324 
   2325 	// Load for encryption
   2326 	VPXOR   (inp), AA0, AA0
   2327 	VMOVDQU AA0, (oup)
   2328 	LEAQ    (1*32)(inp), inp
   2329 
   2330 	// Now can hash
   2331 	polyAdd(0*8(oup))
   2332 	polyMulAVX2
   2333 	polyAdd(2*8(oup))
   2334 	polyMulAVX2
   2335 	LEAQ (1*32)(oup), oup
   2336 
   2337 	// Shift stream left
   2338 	VMOVDQA BB0, AA0
   2339 	VMOVDQA CC0, BB0
   2340 	VMOVDQA DD0, CC0
   2341 	VMOVDQA AA1, DD0
   2342 	VMOVDQA BB1, AA1
   2343 	VMOVDQA CC1, BB1
   2344 	VMOVDQA DD1, CC1
   2345 	VMOVDQA AA2, DD1
   2346 	VMOVDQA BB2, AA2
   2347 	JMP     sealAVX2ShortSealLoop
   2348 
   2349 sealAVX2ShortTail32:
   2350 	CMPQ    inl, $16
   2351 	VMOVDQA A0, A1
   2352 	JB      sealAVX2ShortDone
   2353 
   2354 	SUBQ $16, inl
   2355 
   2356 	// Load for encryption
   2357 	VPXOR   (inp), A0, T0
   2358 	VMOVDQU T0, (oup)
   2359 	LEAQ    (1*16)(inp), inp
   2360 
   2361 	// Hash
   2362 	polyAdd(0*8(oup))
   2363 	polyMulAVX2
   2364 	LEAQ       (1*16)(oup), oup
   2365 	VPERM2I128 $0x11, AA0, AA0, AA0
   2366 	VMOVDQA    A0, A1
   2367 
   2368 sealAVX2ShortDone:
   2369 	VZEROUPPER
   2370 	JMP sealSSETail
   2371 
   2372 // ----------------------------------------------------------------------------
   2373 // Special optimization for buffers smaller than 321 bytes
   2374 seal320AVX2:
   2375 	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
   2376 	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
   2377 	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
   2378 	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
   2379 	MOVQ    $10, itr2
   2380 
   2381 sealAVX2320InnerCipherLoop:
   2382 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   2383 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
   2384 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   2385 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
   2386 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   2387 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
   2388 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   2389 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
   2390 	DECQ     itr2
   2391 	JNE      sealAVX2320InnerCipherLoop
   2392 
   2393 	VMOVDQA chacha20Constants<>(SB), TT0
   2394 	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
   2395 	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
   2396 	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
   2397 	VMOVDQA avx2IncMask<>(SB), TT0
   2398 	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
   2399 	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
   2400 	VPADDD  TT3, DD2, DD2
   2401 
   2402 	// Clamp and store poly key
   2403 	VPERM2I128 $0x02, AA0, BB0, TT0
   2404 	VPAND      polyClampMask<>(SB), TT0, TT0
   2405 	VMOVDQA    TT0, rsStoreAVX2
   2406 
   2407 	// Stream for up to 320 bytes
   2408 	VPERM2I128 $0x13, AA0, BB0, AA0
   2409 	VPERM2I128 $0x13, CC0, DD0, BB0
   2410 	VPERM2I128 $0x02, AA1, BB1, CC0
   2411 	VPERM2I128 $0x02, CC1, DD1, DD0
   2412 	VPERM2I128 $0x13, AA1, BB1, AA1
   2413 	VPERM2I128 $0x13, CC1, DD1, BB1
   2414 	VPERM2I128 $0x02, AA2, BB2, CC1
   2415 	VPERM2I128 $0x02, CC2, DD2, DD1
   2416 	VPERM2I128 $0x13, AA2, BB2, AA2
   2417 	VPERM2I128 $0x13, CC2, DD2, BB2
   2418 	JMP        sealAVX2ShortSeal
   2419 
   2420 // ----------------------------------------------------------------------------
   2421 // Special optimization for the last 128 bytes of ciphertext
   2422 sealAVX2Tail128:
   2423 	// Need to decrypt up to 128 bytes - prepare two blocks
   2424 	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
   2425 	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
   2426 	VMOVDQA chacha20Constants<>(SB), AA0
   2427 	VMOVDQA state1StoreAVX2, BB0
   2428 	VMOVDQA state2StoreAVX2, CC0
   2429 	VMOVDQA ctr3StoreAVX2, DD0
   2430 	VPADDD  avx2IncMask<>(SB), DD0, DD0
   2431 	VMOVDQA DD0, DD1
   2432 
   2433 sealAVX2Tail128LoopA:
   2434 	polyAdd(0(oup))
   2435 	polyMul
   2436 	LEAQ 16(oup), oup
   2437 
   2438 sealAVX2Tail128LoopB:
   2439 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   2440 	polyAdd(0(oup))
   2441 	polyMul
   2442 	VPALIGNR $4, BB0, BB0, BB0
   2443 	VPALIGNR $8, CC0, CC0, CC0
   2444 	VPALIGNR $12, DD0, DD0, DD0
   2445 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   2446 	polyAdd(16(oup))
   2447 	polyMul
   2448 	LEAQ     32(oup), oup
   2449 	VPALIGNR $12, BB0, BB0, BB0
   2450 	VPALIGNR $8, CC0, CC0, CC0
   2451 	VPALIGNR $4, DD0, DD0, DD0
   2452 	DECQ     itr1
   2453 	JG       sealAVX2Tail128LoopA
   2454 	DECQ     itr2
   2455 	JGE      sealAVX2Tail128LoopB
   2456 
   2457 	VPADDD chacha20Constants<>(SB), AA0, AA1
   2458 	VPADDD state1StoreAVX2, BB0, BB1
   2459 	VPADDD state2StoreAVX2, CC0, CC1
   2460 	VPADDD DD1, DD0, DD1
   2461 
   2462 	VPERM2I128 $0x02, AA1, BB1, AA0
   2463 	VPERM2I128 $0x02, CC1, DD1, BB0
   2464 	VPERM2I128 $0x13, AA1, BB1, CC0
   2465 	VPERM2I128 $0x13, CC1, DD1, DD0
   2466 	JMP        sealAVX2ShortSealLoop
   2467 
   2468 // ----------------------------------------------------------------------------
   2469 // Special optimization for the last 256 bytes of ciphertext
   2470 sealAVX2Tail256:
   2471 	// Need to decrypt up to 256 bytes - prepare two blocks
   2472 	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
   2473 	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
   2474 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1
   2475 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
   2476 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
   2477 	VMOVDQA ctr3StoreAVX2, DD0
   2478 	VPADDD  avx2IncMask<>(SB), DD0, DD0
   2479 	VPADDD  avx2IncMask<>(SB), DD0, DD1
   2480 	VMOVDQA DD0, TT1
   2481 	VMOVDQA DD1, TT2
   2482 
   2483 sealAVX2Tail256LoopA:
   2484 	polyAdd(0(oup))
   2485 	polyMul
   2486 	LEAQ 16(oup), oup
   2487 
   2488 sealAVX2Tail256LoopB:
   2489 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   2490 	polyAdd(0(oup))
   2491 	polyMul
   2492 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   2493 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   2494 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   2495 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   2496 	polyAdd(16(oup))
   2497 	polyMul
   2498 	LEAQ     32(oup), oup
   2499 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   2500 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   2501 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   2502 	DECQ     itr1
   2503 	JG       sealAVX2Tail256LoopA
   2504 	DECQ     itr2
   2505 	JGE      sealAVX2Tail256LoopB
   2506 
   2507 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
   2508 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
   2509 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
   2510 	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
   2511 	VPERM2I128 $0x02, AA0, BB0, TT0
   2512 	VPERM2I128 $0x02, CC0, DD0, TT1
   2513 	VPERM2I128 $0x13, AA0, BB0, TT2
   2514 	VPERM2I128 $0x13, CC0, DD0, TT3
   2515 	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
   2516 	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
   2517 	MOVQ       $128, itr1
   2518 	LEAQ       128(inp), inp
   2519 	SUBQ       $128, inl
   2520 	VPERM2I128 $0x02, AA1, BB1, AA0
   2521 	VPERM2I128 $0x02, CC1, DD1, BB0
   2522 	VPERM2I128 $0x13, AA1, BB1, CC0
   2523 	VPERM2I128 $0x13, CC1, DD1, DD0
   2524 
   2525 	JMP sealAVX2SealHash
   2526 
   2527 // ----------------------------------------------------------------------------
   2528 // Special optimization for the last 384 bytes of ciphertext
   2529 sealAVX2Tail384:
   2530 	// Need to decrypt up to 384 bytes - prepare two blocks
   2531 	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
   2532 	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
   2533 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
   2534 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
   2535 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
   2536 	VMOVDQA ctr3StoreAVX2, DD0
   2537 	VPADDD  avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2
   2538 	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
   2539 
   2540 sealAVX2Tail384LoopA:
   2541 	polyAdd(0(oup))
   2542 	polyMul
   2543 	LEAQ 16(oup), oup
   2544 
   2545 sealAVX2Tail384LoopB:
   2546 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   2547 	polyAdd(0(oup))
   2548 	polyMul
   2549 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
   2550 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   2551 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
   2552 	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
   2553 	polyAdd(16(oup))
   2554 	polyMul
   2555 	LEAQ     32(oup), oup
   2556 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
   2557 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
   2558 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
   2559 	DECQ     itr1
   2560 	JG       sealAVX2Tail384LoopA
   2561 	DECQ     itr2
   2562 	JGE      sealAVX2Tail384LoopB
   2563 
   2564 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
   2565 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
   2566 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
   2567 	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
   2568 	VPERM2I128 $0x02, AA0, BB0, TT0
   2569 	VPERM2I128 $0x02, CC0, DD0, TT1
   2570 	VPERM2I128 $0x13, AA0, BB0, TT2
   2571 	VPERM2I128 $0x13, CC0, DD0, TT3
   2572 	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
   2573 	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
   2574 	VPERM2I128 $0x02, AA1, BB1, TT0
   2575 	VPERM2I128 $0x02, CC1, DD1, TT1
   2576 	VPERM2I128 $0x13, AA1, BB1, TT2
   2577 	VPERM2I128 $0x13, CC1, DD1, TT3
   2578 	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
   2579 	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
   2580 	MOVQ       $256, itr1
   2581 	LEAQ       256(inp), inp
   2582 	SUBQ       $256, inl
   2583 	VPERM2I128 $0x02, AA2, BB2, AA0
   2584 	VPERM2I128 $0x02, CC2, DD2, BB0
   2585 	VPERM2I128 $0x13, AA2, BB2, CC0
   2586 	VPERM2I128 $0x13, CC2, DD2, DD0
   2587 
   2588 	JMP sealAVX2SealHash
   2589 
   2590 // ----------------------------------------------------------------------------
   2591 // Special optimization for the last 512 bytes of ciphertext
   2592 sealAVX2Tail512:
   2593 	// Need to decrypt up to 512 bytes - prepare two blocks
   2594 	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
   2595 	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
   2596 	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   2597 	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   2598 	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   2599 	VMOVDQA ctr3StoreAVX2, DD0
   2600 	VPADDD  avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
   2601 	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   2602 
   2603 sealAVX2Tail512LoopA:
   2604 	polyAdd(0(oup))
   2605 	polyMul
   2606 	LEAQ 16(oup), oup
   2607 
   2608 sealAVX2Tail512LoopB:
   2609 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2610 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2611 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   2612 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2613 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2614 	VMOVDQA  CC3, tmpStoreAVX2
   2615 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   2616 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   2617 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   2618 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   2619 	VMOVDQA  tmpStoreAVX2, CC3
   2620 	polyAdd(0*8(oup))
   2621 	polyMulAVX2
   2622 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2623 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2624 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   2625 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2626 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2627 	VMOVDQA  CC3, tmpStoreAVX2
   2628 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   2629 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   2630 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   2631 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   2632 	VMOVDQA  tmpStoreAVX2, CC3
   2633 	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   2634 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   2635 	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   2636 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2637 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2638 	VPSHUFB  rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
   2639 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2640 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2641 	polyAdd(2*8(oup))
   2642 	polyMulAVX2
   2643 	LEAQ     (4*8)(oup), oup
   2644 	VMOVDQA  CC3, tmpStoreAVX2
   2645 	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   2646 	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   2647 	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   2648 	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   2649 	VMOVDQA  tmpStoreAVX2, CC3
   2650 	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   2651 	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   2652 	VPSHUFB  rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
   2653 	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   2654 	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   2655 	VMOVDQA  CC3, tmpStoreAVX2
   2656 	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   2657 	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   2658 	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   2659 	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   2660 	VMOVDQA  tmpStoreAVX2, CC3
   2661 	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   2662 	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   2663 	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   2664 
   2665 	DECQ itr1
   2666 	JG   sealAVX2Tail512LoopA
   2667 	DECQ itr2
   2668 	JGE  sealAVX2Tail512LoopB
   2669 
   2670 	VPADDD     chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
   2671 	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   2672 	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   2673 	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   2674 	VMOVDQA    CC3, tmpStoreAVX2
   2675 	VPERM2I128 $0x02, AA0, BB0, CC3
   2676 	VPXOR      (0*32)(inp), CC3, CC3
   2677 	VMOVDQU    CC3, (0*32)(oup)
   2678 	VPERM2I128 $0x02, CC0, DD0, CC3
   2679 	VPXOR      (1*32)(inp), CC3, CC3
   2680 	VMOVDQU    CC3, (1*32)(oup)
   2681 	VPERM2I128 $0x13, AA0, BB0, CC3
   2682 	VPXOR      (2*32)(inp), CC3, CC3
   2683 	VMOVDQU    CC3, (2*32)(oup)
   2684 	VPERM2I128 $0x13, CC0, DD0, CC3
   2685 	VPXOR      (3*32)(inp), CC3, CC3
   2686 	VMOVDQU    CC3, (3*32)(oup)
   2687 
   2688 	VPERM2I128 $0x02, AA1, BB1, AA0
   2689 	VPERM2I128 $0x02, CC1, DD1, BB0
   2690 	VPERM2I128 $0x13, AA1, BB1, CC0
   2691 	VPERM2I128 $0x13, CC1, DD1, DD0
   2692 	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   2693 	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   2694 
   2695 	VPERM2I128 $0x02, AA2, BB2, AA0
   2696 	VPERM2I128 $0x02, CC2, DD2, BB0
   2697 	VPERM2I128 $0x13, AA2, BB2, CC0
   2698 	VPERM2I128 $0x13, CC2, DD2, DD0
   2699 	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   2700 	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   2701 
   2702 	MOVQ       $384, itr1
   2703 	LEAQ       384(inp), inp
   2704 	SUBQ       $384, inl
   2705 	VPERM2I128 $0x02, AA3, BB3, AA0
   2706 	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
   2707 	VPERM2I128 $0x13, AA3, BB3, CC0
   2708 	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   2709 
   2710 	JMP sealAVX2SealHash
   2711 
   2712 // func haveSSSE3() bool
   2713 TEXT haveSSSE3(SB), NOSPLIT, $0
   2714 	XORQ AX, AX
   2715 	INCL AX
   2716 	CPUID
   2717 	SHRQ $9, CX
   2718 	ANDQ $1, CX
   2719 	MOVB CX, ret+0(FP)
   2720 	RET
   2721 
   2722