// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"
// General register allocation
#define oup DI
#define inp SI
#define inl BX
#define adp CX  // free to reuse, after we hash the additional data
#define keyp R8 // free to reuse, when we copy the key to stack
#define itr2 R9 // general iterator
#define itr1 CX // general iterator
#define acc0 R10
#define acc1 R11
#define acc2 R12
#define t0 R13
#define t1 R14
#define t2 R15
#define t3 R8
// Register and stack allocation for the SSE code
#define rStore (0*16)(BP)
#define sStore (1*16)(BP)
#define state1Store (2*16)(BP)
#define state2Store (3*16)(BP)
#define tmpStore (4*16)(BP)
#define ctr0Store (5*16)(BP)
#define ctr1Store (6*16)(BP)
#define ctr2Store (7*16)(BP)
#define ctr3Store (8*16)(BP)
#define A0 X0
#define A1 X1
#define A2 X2
#define B0 X3
#define B1 X4
#define B2 X5
#define C0 X6
#define C1 X7
#define C2 X8
#define D0 X9
#define D1 X10
#define D2 X11
#define T0 X12
#define T1 X13
#define T2 X14
#define T3 X15
#define A3 T0
#define B3 T1
#define C3 T2
#define D3 T3
// Register and stack allocation for the AVX2 code
#define rsStoreAVX2 (0*32)(BP)
#define state1StoreAVX2 (1*32)(BP)
#define state2StoreAVX2 (2*32)(BP)
#define ctr0StoreAVX2 (3*32)(BP)
#define ctr1StoreAVX2 (4*32)(BP)
#define ctr2StoreAVX2 (5*32)(BP)
#define ctr3StoreAVX2 (6*32)(BP)
#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
#define AA0 Y0
#define AA1 Y5
#define AA2 Y6
#define AA3 Y7
#define BB0 Y14
#define BB1 Y9
#define BB2 Y10
#define BB3 Y11
#define CC0 Y12
#define CC1 Y13
#define CC2 Y8
#define CC3 Y15
#define DD0 Y4
#define DD1 Y1
#define DD2 Y2
#define DD3 Y3
#define TT0 DD3
#define TT1 AA3
#define TT2 BB3
#define TT3 CC3
// ChaCha20 constants
DATA chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA rol16<>+0x10(SB)/8, $0x0504070601000302
DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB
DATA rol8<>+0x00(SB)/8, $0x0605040702010003
DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA rol8<>+0x10(SB)/8, $0x0605040702010003
DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B

DATA avx2InitMask<>+0x00(SB)/8, $0x0
DATA avx2InitMask<>+0x08(SB)/8, $0x0
DATA avx2InitMask<>+0x10(SB)/8, $0x1
DATA avx2InitMask<>+0x18(SB)/8, $0x0

DATA avx2IncMask<>+0x00(SB)/8, $0x2
DATA avx2IncMask<>+0x08(SB)/8, $0x0
DATA avx2IncMask<>+0x10(SB)/8, $0x2
DATA avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp
DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

DATA sseIncMask<>+0x00(SB)/8, $0x1
DATA sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL rol16<>(SB), (NOPTR+RODATA), $32
GLOBL rol8<>(SB), (NOPTR+RODATA), $32
GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
#define shiftC0Right shiftC0Left
#define shiftC1Right shiftC1Left
#define shiftC2Right shiftC2Left
#define shiftC3Right shiftC3Left
#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros
#define chachaQR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D                            \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
	PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D                             \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B

#define chachaQR_AVX2(A, B, C, D, T) \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D                         \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D                          \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B

#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2

#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3

#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
// ----------------------------------------------------------------------------
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// adp points to beginning of additional data
	// itr2 holds ad length
	XORQ acc0, acc0
	XORQ acc1, acc1
	XORQ acc2, acc2
	CMPQ itr2, $13
	JNE  hashADLoop

openFastTLSAD:
	// Special treatment for the TLS case of 13 bytes
	MOVQ (adp), acc0
	MOVQ 5(adp), acc1
	SHRQ $24, acc1
	MOVQ $1, acc2
	polyMul
	RET

hashADLoop:
	// Hash in 16 byte chunks
	CMPQ itr2, $16
	JB   hashADTail
	polyAdd(0(adp))
	LEAQ (1*16)(adp), adp
	SUBQ $16, itr2
	polyMul
	JMP  hashADLoop

hashADTail:
	CMPQ itr2, $0
	JE   hashADDone

	// Hash last < 16 byte tail
	XORQ t0, t0
	XORQ t1, t1
	XORQ t2, t2
	ADDQ itr2, adp

hashADTailLoop:
	SHLQ $8, t1:t0
	SHLQ $8, t0
	MOVB -1(adp), t2
	XORQ t2, t0
	DECQ adp
	DECQ itr2
	JNE  hashADTailLoop

hashADTailFinish:
	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	// Finished AD
hashADDone:
	RET

// ----------------------------------------------------------------------------
// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
TEXT ·chacha20Poly1305Open(SB), 0, $288-97
	// For aligned stack access
	MOVQ SP, BP
	ADDQ $32, BP
	ANDQ $-32, BP
	MOVQ
dst+0(FP), oup 275 MOVQ key+24(FP), keyp 276 MOVQ src+48(FP), inp 277 MOVQ src_len+56(FP), inl 278 MOVQ ad+72(FP), adp 279 280 // Check for AVX2 support 281 CMPB runtimesupport_avx2(SB), $0 282 JE noavx2bmi2Open 283 284 // Check BMI2 bit for MULXQ. 285 // runtimecpuid_ebx7 is always available here 286 // because it passed avx2 check 287 TESTL $(1<<8), runtimecpuid_ebx7(SB) 288 JNE chacha20Poly1305Open_AVX2 289 noavx2bmi2Open: 290 291 // Special optimization, for very short buffers 292 CMPQ inl, $128 293 JBE openSSE128 // About 16% faster 294 295 // For long buffers, prepare the poly key first 296 MOVOU chacha20Constants<>(SB), A0 297 MOVOU (1*16)(keyp), B0 298 MOVOU (2*16)(keyp), C0 299 MOVOU (3*16)(keyp), D0 300 MOVO D0, T1 301 302 // Store state on stack for future use 303 MOVO B0, state1Store 304 MOVO C0, state2Store 305 MOVO D0, ctr3Store 306 MOVQ $10, itr2 307 308 openSSEPreparePolyKey: 309 chachaQR(A0, B0, C0, D0, T0) 310 shiftB0Left; shiftC0Left; shiftD0Left 311 chachaQR(A0, B0, C0, D0, T0) 312 shiftB0Right; shiftC0Right; shiftD0Right 313 DECQ itr2 314 JNE openSSEPreparePolyKey 315 316 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 317 PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0 318 319 // Clamp and store the key 320 PAND polyClampMask<>(SB), A0 321 MOVO A0, rStore; MOVO B0, sStore 322 323 // Hash AAD 324 MOVQ ad_len+80(FP), itr2 325 CALL polyHashADInternal<>(SB) 326 327 openSSEMainLoop: 328 CMPQ inl, $256 329 JB openSSEMainLoopDone 330 331 // Load state, increment counter blocks 332 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 333 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 334 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 335 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 336 337 // Store counters 338 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, 
ctr2Store; MOVO D3, ctr3Store 339 340 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 341 MOVQ $4, itr1 342 MOVQ inp, itr2 343 344 openSSEInternalLoop: 345 MOVO C3, tmpStore 346 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 347 MOVO tmpStore, C3 348 MOVO C1, tmpStore 349 chachaQR(A3, B3, C3, D3, C1) 350 MOVO tmpStore, C1 351 polyAdd(0(itr2)) 352 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 353 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 354 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 355 polyMulStage1 356 polyMulStage2 357 LEAQ (2*8)(itr2), itr2 358 MOVO C3, tmpStore 359 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 360 MOVO tmpStore, C3 361 MOVO C1, tmpStore 362 polyMulStage3 363 chachaQR(A3, B3, C3, D3, C1) 364 MOVO tmpStore, C1 365 polyMulReduceStage 366 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 367 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 368 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 369 DECQ itr1 370 JGE openSSEInternalLoop 371 372 polyAdd(0(itr2)) 373 polyMul 374 LEAQ (2*8)(itr2), itr2 375 376 CMPQ itr1, $-6 377 JG openSSEInternalLoop 378 379 // Add in the state 380 PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 381 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 382 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 383 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 384 385 // Load - xor - store 386 MOVO D3, tmpStore 387 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) 388 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) 389 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) 390 MOVOU 
(3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) 391 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) 392 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) 393 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) 394 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) 395 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) 396 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) 397 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) 398 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) 399 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) 400 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) 401 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) 402 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) 403 LEAQ 256(inp), inp 404 LEAQ 256(oup), oup 405 SUBQ $256, inl 406 JMP openSSEMainLoop 407 408 openSSEMainLoopDone: 409 // Handle the various tail sizes efficiently 410 TESTQ inl, inl 411 JE openSSEFinalize 412 CMPQ inl, $64 413 JBE openSSETail64 414 CMPQ inl, $128 415 JBE openSSETail128 416 CMPQ inl, $192 417 JBE openSSETail192 418 JMP openSSETail256 419 420 openSSEFinalize: 421 // Hash in the PT, AAD lengths 422 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 423 polyMul 424 425 // Final reduce 426 MOVQ acc0, t0 427 MOVQ acc1, t1 428 MOVQ acc2, t2 429 SUBQ $-5, acc0 430 SBBQ $-1, acc1 431 SBBQ $3, acc2 432 CMOVQCS t0, acc0 433 CMOVQCS t1, acc1 434 CMOVQCS t2, acc2 435 436 // Add in the "s" part of the key 437 ADDQ 0+sStore, acc0 438 ADCQ 8+sStore, acc1 439 440 // Finally, constant time compare to the tag at the end of the message 441 XORQ AX, AX 442 MOVQ $1, DX 443 XORQ (0*8)(inp), acc0 444 XORQ (1*8)(inp), acc1 445 ORQ acc1, acc0 446 CMOVQEQ DX, AX 447 448 // Return true iff tags are equal 449 MOVB AX, ret+96(FP) 450 RET 451 452 // ---------------------------------------------------------------------------- 453 // Special optimization for 
buffers smaller than 129 bytes 454 openSSE128: 455 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 456 MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 457 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 458 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 459 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 460 MOVQ $10, itr2 461 462 openSSE128InnerCipherLoop: 463 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 464 shiftB0Left; shiftB1Left; shiftB2Left 465 shiftC0Left; shiftC1Left; shiftC2Left 466 shiftD0Left; shiftD1Left; shiftD2Left 467 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 468 shiftB0Right; shiftB1Right; shiftB2Right 469 shiftC0Right; shiftC1Right; shiftC2Right 470 shiftD0Right; shiftD1Right; shiftD2Right 471 DECQ itr2 472 JNE openSSE128InnerCipherLoop 473 474 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 475 PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 476 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 477 PADDL T2, C1; PADDL T2, C2 478 PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 479 480 // Clamp and store the key 481 PAND polyClampMask<>(SB), A0 482 MOVOU A0, rStore; MOVOU B0, sStore 483 484 // Hash 485 MOVQ ad_len+80(FP), itr2 486 CALL polyHashADInternal<>(SB) 487 488 openSSE128Open: 489 CMPQ inl, $16 490 JB openSSETail16 491 SUBQ $16, inl 492 493 // Load for hashing 494 polyAdd(0(inp)) 495 496 // Load for decryption 497 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) 498 LEAQ (1*16)(inp), inp 499 LEAQ (1*16)(oup), oup 500 polyMul 501 502 // Shift the stream "left" 503 MOVO B1, A1 504 MOVO C1, B1 505 MOVO D1, C1 506 MOVO A2, D1 507 MOVO B2, A2 508 MOVO C2, B2 509 MOVO D2, C2 510 JMP openSSE128Open 511 512 openSSETail16: 513 TESTQ 
inl, inl 514 JE openSSEFinalize 515 516 // We can safely load the CT from the end, because it is padded with the MAC 517 MOVQ inl, itr2 518 SHLQ $4, itr2 519 LEAQ andMask<>(SB), t0 520 MOVOU (inp), T0 521 ADDQ inl, inp 522 PAND -16(t0)(itr2*1), T0 523 MOVO T0, 0+tmpStore 524 MOVQ T0, t0 525 MOVQ 8+tmpStore, t1 526 PXOR A1, T0 527 528 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes 529 openSSETail16Store: 530 MOVQ T0, t3 531 MOVB t3, (oup) 532 PSRLDQ $1, T0 533 INCQ oup 534 DECQ inl 535 JNE openSSETail16Store 536 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 537 polyMul 538 JMP openSSEFinalize 539 540 // ---------------------------------------------------------------------------- 541 // Special optimization for the last 64 bytes of ciphertext 542 openSSETail64: 543 // Need to decrypt up to 64 bytes - prepare single block 544 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store 545 XORQ itr2, itr2 546 MOVQ inl, itr1 547 CMPQ itr1, $16 548 JB openSSETail64LoopB 549 550 openSSETail64LoopA: 551 // Perform ChaCha rounds, while hashing the remaining input 552 polyAdd(0(inp)(itr2*1)) 553 polyMul 554 SUBQ $16, itr1 555 556 openSSETail64LoopB: 557 ADDQ $16, itr2 558 chachaQR(A0, B0, C0, D0, T0) 559 shiftB0Left; shiftC0Left; shiftD0Left 560 chachaQR(A0, B0, C0, D0, T0) 561 shiftB0Right; shiftC0Right; shiftD0Right 562 563 CMPQ itr1, $16 564 JAE openSSETail64LoopA 565 566 CMPQ itr2, $160 567 JNE openSSETail64LoopB 568 569 PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 570 571 openSSETail64DecLoop: 572 CMPQ inl, $16 573 JB openSSETail64DecLoopDone 574 SUBQ $16, inl 575 MOVOU (inp), T0 576 PXOR T0, A0 577 MOVOU A0, (oup) 578 LEAQ 16(inp), inp 579 LEAQ 16(oup), oup 580 MOVO B0, A0 581 MOVO C0, B0 582 MOVO D0, C0 583 JMP openSSETail64DecLoop 584 585 openSSETail64DecLoopDone: 586 MOVO A0, A1 587 JMP 
openSSETail16 588 589 // ---------------------------------------------------------------------------- 590 // Special optimization for the last 128 bytes of ciphertext 591 openSSETail128: 592 // Need to decrypt up to 128 bytes - prepare two blocks 593 MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store 594 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store 595 XORQ itr2, itr2 596 MOVQ inl, itr1 597 ANDQ $-16, itr1 598 599 openSSETail128LoopA: 600 // Perform ChaCha rounds, while hashing the remaining input 601 polyAdd(0(inp)(itr2*1)) 602 polyMul 603 604 openSSETail128LoopB: 605 ADDQ $16, itr2 606 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 607 shiftB0Left; shiftC0Left; shiftD0Left 608 shiftB1Left; shiftC1Left; shiftD1Left 609 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 610 shiftB0Right; shiftC0Right; shiftD0Right 611 shiftB1Right; shiftC1Right; shiftD1Right 612 613 CMPQ itr2, itr1 614 JB openSSETail128LoopA 615 616 CMPQ itr2, $160 617 JNE openSSETail128LoopB 618 619 PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 620 PADDL state1Store, B0; PADDL state1Store, B1 621 PADDL state2Store, C0; PADDL state2Store, C1 622 PADDL ctr1Store, D0; PADDL ctr0Store, D1 623 624 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 625 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 626 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 627 628 SUBQ $64, inl 629 LEAQ 64(inp), inp 630 LEAQ 64(oup), oup 631 JMP openSSETail64DecLoop 632 633 // ---------------------------------------------------------------------------- 634 // Special optimization for the last 192 bytes of ciphertext 635 openSSETail192: 636 // Need to decrypt up to 192 bytes - prepare three blocks 637 MOVO chacha20Constants<>(SB), A2; MOVO state1Store, 
B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store 638 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store 639 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store 640 641 MOVQ inl, itr1 642 MOVQ $160, itr2 643 CMPQ itr1, $160 644 CMOVQGT itr2, itr1 645 ANDQ $-16, itr1 646 XORQ itr2, itr2 647 648 openSSLTail192LoopA: 649 // Perform ChaCha rounds, while hashing the remaining input 650 polyAdd(0(inp)(itr2*1)) 651 polyMul 652 653 openSSLTail192LoopB: 654 ADDQ $16, itr2 655 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 656 shiftB0Left; shiftC0Left; shiftD0Left 657 shiftB1Left; shiftC1Left; shiftD1Left 658 shiftB2Left; shiftC2Left; shiftD2Left 659 660 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 661 shiftB0Right; shiftC0Right; shiftD0Right 662 shiftB1Right; shiftC1Right; shiftD1Right 663 shiftB2Right; shiftC2Right; shiftD2Right 664 665 CMPQ itr2, itr1 666 JB openSSLTail192LoopA 667 668 CMPQ itr2, $160 669 JNE openSSLTail192LoopB 670 671 CMPQ inl, $176 672 JB openSSLTail192Store 673 674 polyAdd(160(inp)) 675 polyMul 676 677 CMPQ inl, $192 678 JB openSSLTail192Store 679 680 polyAdd(176(inp)) 681 polyMul 682 683 openSSLTail192Store: 684 PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 685 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 686 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 687 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 688 689 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 690 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 691 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) 692 693 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU 
(6*16)(inp), T2; MOVOU (7*16)(inp), T3 694 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 695 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 696 697 SUBQ $128, inl 698 LEAQ 128(inp), inp 699 LEAQ 128(oup), oup 700 JMP openSSETail64DecLoop 701 702 // ---------------------------------------------------------------------------- 703 // Special optimization for the last 256 bytes of ciphertext 704 openSSETail256: 705 // Need to decrypt up to 256 bytes - prepare four blocks 706 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 707 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 708 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 709 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 710 711 // Store counters 712 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 713 XORQ itr2, itr2 714 715 openSSETail256Loop: 716 // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication 717 polyAdd(0(inp)(itr2*1)) 718 MOVO C3, tmpStore 719 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 720 MOVO tmpStore, C3 721 MOVO C1, tmpStore 722 chachaQR(A3, B3, C3, D3, C1) 723 MOVO tmpStore, C1 724 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 725 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 726 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 727 polyMulStage1 728 polyMulStage2 729 MOVO C3, tmpStore 730 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 731 MOVO tmpStore, C3 732 MOVO C1, tmpStore 733 chachaQR(A3, B3, C3, D3, C1) 734 MOVO tmpStore, C1 735 polyMulStage3 736 polyMulReduceStage 737 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 738 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 739 shiftD0Right; shiftD1Right; shiftD2Right; 
shiftD3Right 740 ADDQ $2*8, itr2 741 CMPQ itr2, $160 742 JB openSSETail256Loop 743 MOVQ inl, itr1 744 ANDQ $-16, itr1 745 746 openSSETail256HashLoop: 747 polyAdd(0(inp)(itr2*1)) 748 polyMul 749 ADDQ $2*8, itr2 750 CMPQ itr2, itr1 751 JB openSSETail256HashLoop 752 753 // Add in the state 754 PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 755 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 756 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 757 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 758 MOVO D3, tmpStore 759 760 // Load - xor - store 761 MOVOU (0*16)(inp), D3; PXOR D3, A0 762 MOVOU (1*16)(inp), D3; PXOR D3, B0 763 MOVOU (2*16)(inp), D3; PXOR D3, C0 764 MOVOU (3*16)(inp), D3; PXOR D3, D0 765 MOVOU A0, (0*16)(oup) 766 MOVOU B0, (1*16)(oup) 767 MOVOU C0, (2*16)(oup) 768 MOVOU D0, (3*16)(oup) 769 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 770 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 771 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 772 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 773 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 774 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 775 LEAQ 192(inp), inp 776 LEAQ 192(oup), oup 777 SUBQ $192, inl 778 MOVO A3, A0 779 MOVO B3, B0 780 MOVO C3, C0 781 MOVO tmpStore, D0 782 783 JMP openSSETail64DecLoop 784 785 // ---------------------------------------------------------------------------- 786 // ------------------------- AVX2 Code ---------------------------------------- 787 chacha20Poly1305Open_AVX2: 788 VZEROUPPER 789 VMOVDQU chacha20Constants<>(SB), AA0 790 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE 
$0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 791 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 792 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 793 VPADDD avx2InitMask<>(SB), DD0, DD0 794 795 // Special optimization, for very short buffers 796 CMPQ inl, $192 797 JBE openAVX2192 798 CMPQ inl, $320 799 JBE openAVX2320 800 801 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 802 VMOVDQA BB0, state1StoreAVX2 803 VMOVDQA CC0, state2StoreAVX2 804 VMOVDQA DD0, ctr3StoreAVX2 805 MOVQ $10, itr2 806 807 openAVX2PreparePolyKey: 808 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 809 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 810 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 811 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 812 DECQ itr2 813 JNE openAVX2PreparePolyKey 814 815 VPADDD chacha20Constants<>(SB), AA0, AA0 816 VPADDD state1StoreAVX2, BB0, BB0 817 VPADDD state2StoreAVX2, CC0, CC0 818 VPADDD ctr3StoreAVX2, DD0, DD0 819 820 VPERM2I128 $0x02, AA0, BB0, TT0 821 822 // Clamp and store poly key 823 VPAND polyClampMask<>(SB), TT0, TT0 824 VMOVDQA TT0, rsStoreAVX2 825 826 // Stream for the first 64 bytes 827 VPERM2I128 $0x13, AA0, BB0, AA0 828 VPERM2I128 $0x13, CC0, DD0, BB0 829 830 // Hash AD + first 64 bytes 831 MOVQ ad_len+80(FP), itr2 832 CALL polyHashADInternal<>(SB) 833 XORQ itr1, itr1 834 835 openAVX2InitialHash64: 836 polyAdd(0(inp)(itr1*1)) 837 polyMulAVX2 838 ADDQ $16, itr1 839 CMPQ itr1, $64 840 JNE openAVX2InitialHash64 841 842 // Decrypt the first 64 bytes 843 VPXOR (0*32)(inp), AA0, AA0 844 VPXOR (1*32)(inp), BB0, BB0 845 VMOVDQU AA0, (0*32)(oup) 846 VMOVDQU BB0, (1*32)(oup) 847 LEAQ (2*32)(inp), inp 848 LEAQ (2*32)(oup), oup 849 SUBQ $64, inl 850 851 openAVX2MainLoop: 852 CMPQ inl, $512 853 JB openAVX2MainLoopDone 854 855 // Load state, 
increment counter blocks, store the incremented counters 856 VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 857 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 858 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 859 VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 860 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 861 XORQ itr1, itr1 862 863 openAVX2InternalLoop: 864 // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications 865 // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext 866 polyAdd(0*8(inp)(itr1*1)) 867 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 868 polyMulStage1_AVX2 869 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 870 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 871 polyMulStage2_AVX2 872 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 873 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 874 polyMulStage3_AVX2 875 VMOVDQA CC3, tmpStoreAVX2 876 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 877 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 878 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 879 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 880 VMOVDQA tmpStoreAVX2, CC3 881 polyMulReduceStage 882 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 883 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 884 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB 
rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 885 polyAdd(2*8(inp)(itr1*1)) 886 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 887 polyMulStage1_AVX2 888 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 889 VMOVDQA CC3, tmpStoreAVX2 890 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 891 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 892 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 893 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 894 VMOVDQA tmpStoreAVX2, CC3 895 polyMulStage2_AVX2 896 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 897 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 898 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 899 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 900 polyMulStage3_AVX2 901 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 902 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 903 polyMulReduceStage 904 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 905 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 906 polyAdd(4*8(inp)(itr1*1)) 907 LEAQ (6*8)(itr1), itr1 908 VMOVDQA CC3, tmpStoreAVX2 909 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 910 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 911 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 912 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 913 VMOVDQA tmpStoreAVX2, CC3 914 polyMulStage1_AVX2 915 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, 
AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 916 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 917 polyMulStage2_AVX2 918 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 919 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 920 polyMulStage3_AVX2 921 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 922 VMOVDQA CC3, tmpStoreAVX2 923 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 924 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 925 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 926 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 927 VMOVDQA tmpStoreAVX2, CC3 928 polyMulReduceStage 929 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 930 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 931 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 932 CMPQ itr1, $480 933 JNE openAVX2InternalLoop 934 935 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 936 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 937 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 938 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 939 VMOVDQA CC3, tmpStoreAVX2 940 941 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 942 polyAdd(480(inp)) 943 polyMulAVX2 944 VPERM2I128 $0x02, AA0, 
BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 945 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 946 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 947 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 948 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 949 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 950 951 // and here 952 polyAdd(496(inp)) 953 polyMulAVX2 954 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 955 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 956 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 957 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 958 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 959 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 960 LEAQ (32*16)(inp), inp 961 LEAQ (32*16)(oup), oup 962 SUBQ $(32*16), inl 963 JMP openAVX2MainLoop 964 965 openAVX2MainLoopDone: 966 // Handle the various tail sizes efficiently 967 TESTQ inl, inl 968 JE openSSEFinalize 969 CMPQ inl, $128 970 JBE openAVX2Tail128 971 CMPQ inl, $256 972 JBE openAVX2Tail256 973 CMPQ inl, $384 974 JBE openAVX2Tail384 975 JMP openAVX2Tail512 976 977 // ---------------------------------------------------------------------------- 978 // Special optimization 
for buffers smaller than 193 bytes 979 openAVX2192: 980 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 981 VMOVDQA AA0, AA1 982 VMOVDQA BB0, BB1 983 VMOVDQA CC0, CC1 984 VPADDD avx2IncMask<>(SB), DD0, DD1 985 VMOVDQA AA0, AA2 986 VMOVDQA BB0, BB2 987 VMOVDQA CC0, CC2 988 VMOVDQA DD0, DD2 989 VMOVDQA DD1, TT3 990 MOVQ $10, itr2 991 992 openAVX2192InnerCipherLoop: 993 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 994 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 995 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 996 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 997 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 998 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 999 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1000 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 1001 DECQ itr2 1002 JNE openAVX2192InnerCipherLoop 1003 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 1004 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 1005 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 1006 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 1007 VPERM2I128 $0x02, AA0, BB0, TT0 1008 1009 // Clamp and store poly key 1010 VPAND polyClampMask<>(SB), TT0, TT0 1011 VMOVDQA TT0, rsStoreAVX2 1012 1013 // Stream for up to 192 bytes 1014 VPERM2I128 $0x13, AA0, BB0, AA0 1015 VPERM2I128 $0x13, CC0, DD0, BB0 1016 VPERM2I128 $0x02, AA1, BB1, CC0 1017 VPERM2I128 $0x02, CC1, DD1, DD0 1018 VPERM2I128 $0x13, AA1, BB1, AA1 1019 VPERM2I128 $0x13, CC1, DD1, BB1 1020 1021 openAVX2ShortOpen: 1022 // Hash 1023 MOVQ ad_len+80(FP), itr2 1024 CALL polyHashADInternal<>(SB) 1025 1026 openAVX2ShortOpenLoop: 1027 CMPQ inl, $32 1028 JB openAVX2ShortTail32 1029 SUBQ $32, inl 1030 1031 // Load for hashing 1032 polyAdd(0*8(inp)) 1033 polyMulAVX2 1034 polyAdd(2*8(inp)) 1035 polyMulAVX2 1036 1037 // Load for decryption 1038 VPXOR (inp), AA0, AA0 1039 VMOVDQU AA0, (oup) 1040 LEAQ 
(1*32)(inp), inp 1041 LEAQ (1*32)(oup), oup 1042 1043 // Shift stream left 1044 VMOVDQA BB0, AA0 1045 VMOVDQA CC0, BB0 1046 VMOVDQA DD0, CC0 1047 VMOVDQA AA1, DD0 1048 VMOVDQA BB1, AA1 1049 VMOVDQA CC1, BB1 1050 VMOVDQA DD1, CC1 1051 VMOVDQA AA2, DD1 1052 VMOVDQA BB2, AA2 1053 JMP openAVX2ShortOpenLoop 1054 1055 openAVX2ShortTail32: 1056 CMPQ inl, $16 1057 VMOVDQA A0, A1 1058 JB openAVX2ShortDone 1059 1060 SUBQ $16, inl 1061 1062 // Load for hashing 1063 polyAdd(0*8(inp)) 1064 polyMulAVX2 1065 1066 // Load for decryption 1067 VPXOR (inp), A0, T0 1068 VMOVDQU T0, (oup) 1069 LEAQ (1*16)(inp), inp 1070 LEAQ (1*16)(oup), oup 1071 VPERM2I128 $0x11, AA0, AA0, AA0 1072 VMOVDQA A0, A1 1073 1074 openAVX2ShortDone: 1075 VZEROUPPER 1076 JMP openSSETail16 1077 1078 // ---------------------------------------------------------------------------- 1079 // Special optimization for buffers smaller than 321 bytes 1080 openAVX2320: 1081 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 1082 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 1083 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 1084 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 1085 MOVQ $10, itr2 1086 1087 openAVX2320InnerCipherLoop: 1088 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1089 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1090 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1091 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1092 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1093 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1094 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; 
VPALIGNR $8, CC2, CC2, CC2 1095 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1096 DECQ itr2 1097 JNE openAVX2320InnerCipherLoop 1098 1099 VMOVDQA chacha20Constants<>(SB), TT0 1100 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 1101 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 1102 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 1103 VMOVDQA avx2IncMask<>(SB), TT0 1104 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 1105 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 1106 VPADDD TT3, DD2, DD2 1107 1108 // Clamp and store poly key 1109 VPERM2I128 $0x02, AA0, BB0, TT0 1110 VPAND polyClampMask<>(SB), TT0, TT0 1111 VMOVDQA TT0, rsStoreAVX2 1112 1113 // Stream for up to 320 bytes 1114 VPERM2I128 $0x13, AA0, BB0, AA0 1115 VPERM2I128 $0x13, CC0, DD0, BB0 1116 VPERM2I128 $0x02, AA1, BB1, CC0 1117 VPERM2I128 $0x02, CC1, DD1, DD0 1118 VPERM2I128 $0x13, AA1, BB1, AA1 1119 VPERM2I128 $0x13, CC1, DD1, BB1 1120 VPERM2I128 $0x02, AA2, BB2, CC1 1121 VPERM2I128 $0x02, CC2, DD2, DD1 1122 VPERM2I128 $0x13, AA2, BB2, AA2 1123 VPERM2I128 $0x13, CC2, DD2, BB2 1124 JMP openAVX2ShortOpen 1125 1126 // ---------------------------------------------------------------------------- 1127 // Special optimization for the last 128 bytes of ciphertext 1128 openAVX2Tail128: 1129 // Need to decrypt up to 128 bytes - prepare two blocks 1130 VMOVDQA chacha20Constants<>(SB), AA1 1131 VMOVDQA state1StoreAVX2, BB1 1132 VMOVDQA state2StoreAVX2, CC1 1133 VMOVDQA ctr3StoreAVX2, DD1 1134 VPADDD avx2IncMask<>(SB), DD1, DD1 1135 VMOVDQA DD1, DD0 1136 1137 XORQ itr2, itr2 1138 MOVQ inl, itr1 1139 ANDQ $-16, itr1 1140 TESTQ itr1, itr1 1141 JE openAVX2Tail128LoopB 1142 1143 openAVX2Tail128LoopA: 1144 // Perform ChaCha rounds, while hashing the remaining input 1145 polyAdd(0(inp)(itr2*1)) 1146 polyMulAVX2 1147 1148 openAVX2Tail128LoopB: 1149 ADDQ $16, itr2 1150 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1151 VPALIGNR $4, BB1, BB1, BB1 
1152 VPALIGNR $8, CC1, CC1, CC1 1153 VPALIGNR $12, DD1, DD1, DD1 1154 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1155 VPALIGNR $12, BB1, BB1, BB1 1156 VPALIGNR $8, CC1, CC1, CC1 1157 VPALIGNR $4, DD1, DD1, DD1 1158 CMPQ itr2, itr1 1159 JB openAVX2Tail128LoopA 1160 CMPQ itr2, $160 1161 JNE openAVX2Tail128LoopB 1162 1163 VPADDD chacha20Constants<>(SB), AA1, AA1 1164 VPADDD state1StoreAVX2, BB1, BB1 1165 VPADDD state2StoreAVX2, CC1, CC1 1166 VPADDD DD0, DD1, DD1 1167 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1168 1169 openAVX2TailLoop: 1170 CMPQ inl, $32 1171 JB openAVX2Tail 1172 SUBQ $32, inl 1173 1174 // Load for decryption 1175 VPXOR (inp), AA0, AA0 1176 VMOVDQU AA0, (oup) 1177 LEAQ (1*32)(inp), inp 1178 LEAQ (1*32)(oup), oup 1179 VMOVDQA BB0, AA0 1180 VMOVDQA CC0, BB0 1181 VMOVDQA DD0, CC0 1182 JMP openAVX2TailLoop 1183 1184 openAVX2Tail: 1185 CMPQ inl, $16 1186 VMOVDQA A0, A1 1187 JB openAVX2TailDone 1188 SUBQ $16, inl 1189 1190 // Load for decryption 1191 VPXOR (inp), A0, T0 1192 VMOVDQU T0, (oup) 1193 LEAQ (1*16)(inp), inp 1194 LEAQ (1*16)(oup), oup 1195 VPERM2I128 $0x11, AA0, AA0, AA0 1196 VMOVDQA A0, A1 1197 1198 openAVX2TailDone: 1199 VZEROUPPER 1200 JMP openSSETail16 1201 1202 // ---------------------------------------------------------------------------- 1203 // Special optimization for the last 256 bytes of ciphertext 1204 openAVX2Tail256: 1205 // Need to decrypt up to 256 bytes - prepare four blocks 1206 VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 1207 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 1208 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 1209 VMOVDQA ctr3StoreAVX2, DD0 1210 VPADDD avx2IncMask<>(SB), DD0, DD0 1211 VPADDD avx2IncMask<>(SB), DD0, DD1 1212 VMOVDQA DD0, TT1 1213 VMOVDQA DD1, TT2 1214 1215 // Compute the number of iterations that will hash data 1216 MOVQ inl, tmpStoreAVX2 1217 MOVQ inl, itr1 1218 SUBQ $128, itr1 1219 SHRQ $4, itr1 1220 
MOVQ $10, itr2 1221 CMPQ itr1, $10 1222 CMOVQGT itr2, itr1 1223 MOVQ inp, inl 1224 XORQ itr2, itr2 1225 1226 openAVX2Tail256LoopA: 1227 polyAdd(0(inl)) 1228 polyMulAVX2 1229 LEAQ 16(inl), inl 1230 1231 // Perform ChaCha rounds, while hashing the remaining input 1232 openAVX2Tail256LoopB: 1233 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1234 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 1235 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1236 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 1237 INCQ itr2 1238 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1239 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 1240 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1241 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 1242 CMPQ itr2, itr1 1243 JB openAVX2Tail256LoopA 1244 1245 CMPQ itr2, $10 1246 JNE openAVX2Tail256LoopB 1247 1248 MOVQ inl, itr2 1249 SUBQ inp, inl 1250 MOVQ inl, itr1 1251 MOVQ tmpStoreAVX2, inl 1252 1253 // Hash the remainder of data (if any) 1254 openAVX2Tail256Hash: 1255 ADDQ $16, itr1 1256 CMPQ itr1, inl 1257 JGT openAVX2Tail256HashEnd 1258 polyAdd (0(itr2)) 1259 polyMulAVX2 1260 LEAQ 16(itr2), itr2 1261 JMP openAVX2Tail256Hash 1262 1263 // Store 128 bytes safely, then go to store loop 1264 openAVX2Tail256HashEnd: 1265 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1 1266 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 1267 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 1268 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 1269 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 1270 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1271 1272 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR 
(2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 1273 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) 1274 LEAQ (4*32)(inp), inp 1275 LEAQ (4*32)(oup), oup 1276 SUBQ $4*32, inl 1277 1278 JMP openAVX2TailLoop 1279 1280 // ---------------------------------------------------------------------------- 1281 // Special optimization for the last 384 bytes of ciphertext 1282 openAVX2Tail384: 1283 // Need to decrypt up to 384 bytes - prepare six blocks 1284 VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 1285 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 1286 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 1287 VMOVDQA ctr3StoreAVX2, DD0 1288 VPADDD avx2IncMask<>(SB), DD0, DD0 1289 VPADDD avx2IncMask<>(SB), DD0, DD1 1290 VPADDD avx2IncMask<>(SB), DD1, DD2 1291 VMOVDQA DD0, ctr0StoreAVX2 1292 VMOVDQA DD1, ctr1StoreAVX2 1293 VMOVDQA DD2, ctr2StoreAVX2 1294 1295 // Compute the number of iterations that will hash two blocks of data 1296 MOVQ inl, tmpStoreAVX2 1297 MOVQ inl, itr1 1298 SUBQ $256, itr1 1299 SHRQ $4, itr1 1300 ADDQ $6, itr1 1301 MOVQ $10, itr2 1302 CMPQ itr1, $10 1303 CMOVQGT itr2, itr1 1304 MOVQ inp, inl 1305 XORQ itr2, itr2 1306 1307 // Perform ChaCha rounds, while hashing the remaining input 1308 openAVX2Tail384LoopB: 1309 polyAdd(0(inl)) 1310 polyMulAVX2 1311 LEAQ 16(inl), inl 1312 1313 openAVX2Tail384LoopA: 1314 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1315 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1316 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1317 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1318 polyAdd(0(inl)) 1319 polyMulAVX2 1320 LEAQ 16(inl), inl 1321 INCQ itr2 1322 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, 
TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1323 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1324 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1325 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1326 1327 CMPQ itr2, itr1 1328 JB openAVX2Tail384LoopB 1329 1330 CMPQ itr2, $10 1331 JNE openAVX2Tail384LoopA 1332 1333 MOVQ inl, itr2 1334 SUBQ inp, inl 1335 MOVQ inl, itr1 1336 MOVQ tmpStoreAVX2, inl 1337 1338 openAVX2Tail384Hash: 1339 ADDQ $16, itr1 1340 CMPQ itr1, inl 1341 JGT openAVX2Tail384HashEnd 1342 polyAdd(0(itr2)) 1343 polyMulAVX2 1344 LEAQ 16(itr2), itr2 1345 JMP openAVX2Tail384Hash 1346 1347 // Store 256 bytes safely, then go to store loop 1348 openAVX2Tail384HashEnd: 1349 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2 1350 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 1351 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 1352 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 1353 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 1354 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 1355 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 1356 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 1357 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 1358 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) 1359 VPERM2I128 $0x02, AA2, 
BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1360 LEAQ (8*32)(inp), inp 1361 LEAQ (8*32)(oup), oup 1362 SUBQ $8*32, inl 1363 JMP openAVX2TailLoop 1364 1365 // ---------------------------------------------------------------------------- 1366 // Special optimization for the last 512 bytes of ciphertext 1367 openAVX2Tail512: 1368 VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1369 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 1370 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 1371 VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 1372 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 1373 XORQ itr1, itr1 1374 MOVQ inp, itr2 1375 1376 openAVX2Tail512LoopB: 1377 polyAdd(0(itr2)) 1378 polyMulAVX2 1379 LEAQ (2*8)(itr2), itr2 1380 1381 openAVX2Tail512LoopA: 1382 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1383 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1384 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 1385 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1386 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1387 VMOVDQA CC3, tmpStoreAVX2 1388 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1389 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1390 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1391 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 1392 VMOVDQA tmpStoreAVX2, CC3 1393 polyAdd(0*8(itr2)) 1394 polyMulAVX2 1395 
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1396 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1397 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 1398 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1399 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1400 VMOVDQA CC3, tmpStoreAVX2 1401 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1402 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1403 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1404 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1405 VMOVDQA tmpStoreAVX2, CC3 1406 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 1407 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1408 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 1409 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1410 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1411 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 1412 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1413 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1414 polyAdd(2*8(itr2)) 1415 polyMulAVX2 1416 LEAQ (4*8)(itr2), itr2 1417 VMOVDQA CC3, tmpStoreAVX2 1418 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1419 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1420 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1421 VPSLLD $12, BB3, CC3; VPSRLD $20, 
BB3, BB3; VPXOR CC3, BB3, BB3 1422 VMOVDQA tmpStoreAVX2, CC3 1423 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1424 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1425 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 1426 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1427 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1428 VMOVDQA CC3, tmpStoreAVX2 1429 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1430 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1431 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1432 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1433 VMOVDQA tmpStoreAVX2, CC3 1434 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 1435 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1436 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 1437 INCQ itr1 1438 CMPQ itr1, $4 1439 JLT openAVX2Tail512LoopB 1440 1441 CMPQ itr1, $10 1442 JNE openAVX2Tail512LoopA 1443 1444 MOVQ inl, itr1 1445 SUBQ $384, itr1 1446 ANDQ $-16, itr1 1447 1448 openAVX2Tail512HashLoop: 1449 TESTQ itr1, itr1 1450 JE openAVX2Tail512HashEnd 1451 polyAdd(0(itr2)) 1452 polyMulAVX2 1453 LEAQ 16(itr2), itr2 1454 SUBQ $16, itr1 1455 JMP openAVX2Tail512HashLoop 1456 1457 openAVX2Tail512HashEnd: 1458 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 1459 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 1460 VPADDD state2StoreAVX2, 
CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 1461 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 1462 VMOVDQA CC3, tmpStoreAVX2 1463 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 1464 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 1465 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 1466 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1467 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 1468 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 1469 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1470 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 1471 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 1472 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 1473 1474 LEAQ (12*32)(inp), inp 1475 LEAQ (12*32)(oup), oup 1476 SUBQ $12*32, inl 1477 1478 JMP openAVX2TailLoop 1479 1480 // ---------------------------------------------------------------------------- 1481 // ---------------------------------------------------------------------------- 1482 // func chacha20Poly1305Seal(dst, key, src, ad []byte) 1483 TEXT chacha20Poly1305Seal(SB), 0, $288-96 1484 // For aligned stack access 1485 MOVQ SP, BP 1486 ADDQ $32, BP 1487 ANDQ 
$-32, BP 1488 MOVQ dst+0(FP), oup 1489 MOVQ key+24(FP), keyp 1490 MOVQ src+48(FP), inp 1491 MOVQ src_len+56(FP), inl 1492 MOVQ ad+72(FP), adp 1493 1494 // Check for AVX2 support 1495 CMPB runtimesupport_avx2(SB), $0 1496 JE noavx2bmi2Seal 1497 1498 // Check BMI2 bit for MULXQ. 1499 // runtimecpuid_ebx7 is always available here 1500 // because it passed avx2 check 1501 TESTL $(1<<8), runtimecpuid_ebx7(SB) 1502 JNE chacha20Poly1305Seal_AVX2 1503 noavx2bmi2Seal: 1504 1505 // Special optimization, for very short buffers 1506 CMPQ inl, $128 1507 JBE sealSSE128 // About 15% faster 1508 1509 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration 1510 MOVOU chacha20Constants<>(SB), A0 1511 MOVOU (1*16)(keyp), B0 1512 MOVOU (2*16)(keyp), C0 1513 MOVOU (3*16)(keyp), D0 1514 1515 // Store state on stack for future use 1516 MOVO B0, state1Store 1517 MOVO C0, state2Store 1518 1519 // Load state, increment counter blocks 1520 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 1521 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 1522 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 1523 1524 // Store counters 1525 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 1526 MOVQ $10, itr2 1527 1528 sealSSEIntroLoop: 1529 MOVO C3, tmpStore 1530 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1531 MOVO tmpStore, C3 1532 MOVO C1, tmpStore 1533 chachaQR(A3, B3, C3, D3, C1) 1534 MOVO tmpStore, C1 1535 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1536 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1537 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1538 1539 MOVO C3, tmpStore 1540 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1541 MOVO tmpStore, C3 1542 MOVO C1, tmpStore 1543 chachaQR(A3, B3, C3, D3, C1) 1544 MOVO tmpStore, C1 1545 shiftB0Right; 
shiftB1Right; shiftB2Right; shiftB3Right 1546 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1547 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1548 DECQ itr2 1549 JNE sealSSEIntroLoop 1550 1551 // Add in the state 1552 PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 1553 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1554 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 1555 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1556 1557 // Clamp and store the key 1558 PAND polyClampMask<>(SB), A0 1559 MOVO A0, rStore 1560 MOVO B0, sStore 1561 1562 // Hash AAD 1563 MOVQ ad_len+80(FP), itr2 1564 CALL polyHashADInternal<>(SB) 1565 1566 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1567 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1568 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 1569 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1570 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1571 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) 1572 1573 MOVQ $128, itr1 1574 SUBQ $128, inl 1575 LEAQ 128(inp), inp 1576 1577 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 1578 1579 CMPQ inl, $64 1580 JBE sealSSE128SealHash 1581 1582 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1583 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1584 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) 1585 1586 ADDQ $64, itr1 1587 SUBQ $64, inl 1588 LEAQ 64(inp), inp 1589 1590 MOVQ $2, itr1 1591 MOVQ $8, itr2 1592 1593 CMPQ inl, $64 1594 JBE sealSSETail64 1595 CMPQ inl, $128 1596 JBE sealSSETail128 1597 CMPQ inl, $192 1598 JBE sealSSETail192 
1599 1600 sealSSEMainLoop: 1601 // Load state, increment counter blocks 1602 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 1603 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 1604 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 1605 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 1606 1607 // Store counters 1608 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 1609 1610 sealSSEInnerLoop: 1611 MOVO C3, tmpStore 1612 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1613 MOVO tmpStore, C3 1614 MOVO C1, tmpStore 1615 chachaQR(A3, B3, C3, D3, C1) 1616 MOVO tmpStore, C1 1617 polyAdd(0(oup)) 1618 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1619 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1620 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1621 polyMulStage1 1622 polyMulStage2 1623 LEAQ (2*8)(oup), oup 1624 MOVO C3, tmpStore 1625 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1626 MOVO tmpStore, C3 1627 MOVO C1, tmpStore 1628 polyMulStage3 1629 chachaQR(A3, B3, C3, D3, C1) 1630 MOVO tmpStore, C1 1631 polyMulReduceStage 1632 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 1633 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1634 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1635 DECQ itr2 1636 JGE sealSSEInnerLoop 1637 polyAdd(0(oup)) 1638 polyMul 1639 LEAQ (2*8)(oup), oup 1640 DECQ itr1 1641 JG sealSSEInnerLoop 1642 1643 // Add in the state 1644 PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 1645 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1646 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; 
PADDD state2Store, C3 1647 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1648 MOVO D3, tmpStore 1649 1650 // Load - xor - store 1651 MOVOU (0*16)(inp), D3; PXOR D3, A0 1652 MOVOU (1*16)(inp), D3; PXOR D3, B0 1653 MOVOU (2*16)(inp), D3; PXOR D3, C0 1654 MOVOU (3*16)(inp), D3; PXOR D3, D0 1655 MOVOU A0, (0*16)(oup) 1656 MOVOU B0, (1*16)(oup) 1657 MOVOU C0, (2*16)(oup) 1658 MOVOU D0, (3*16)(oup) 1659 MOVO tmpStore, D3 1660 1661 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1662 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1663 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1664 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 1665 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1666 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 1667 ADDQ $192, inp 1668 MOVQ $192, itr1 1669 SUBQ $192, inl 1670 MOVO A3, A1 1671 MOVO B3, B1 1672 MOVO C3, C1 1673 MOVO D3, D1 1674 CMPQ inl, $64 1675 JBE sealSSE128SealHash 1676 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1677 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1678 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) 1679 LEAQ 64(inp), inp 1680 SUBQ $64, inl 1681 MOVQ $6, itr1 1682 MOVQ $4, itr2 1683 CMPQ inl, $192 1684 JG sealSSEMainLoop 1685 1686 MOVQ inl, itr1 1687 TESTQ inl, inl 1688 JE sealSSE128SealHash 1689 MOVQ $6, itr1 1690 CMPQ inl, $64 1691 JBE sealSSETail64 1692 CMPQ inl, $128 1693 JBE sealSSETail128 1694 JMP sealSSETail192 1695 1696 // ---------------------------------------------------------------------------- 1697 // Special optimization for the last 64 bytes of plaintext 1698 sealSSETail64: 1699 // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes 1700 MOVO 
chacha20Constants<>(SB), A1 1701 MOVO state1Store, B1 1702 MOVO state2Store, C1 1703 MOVO ctr3Store, D1 1704 PADDL sseIncMask<>(SB), D1 1705 MOVO D1, ctr0Store 1706 1707 sealSSETail64LoopA: 1708 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1709 polyAdd(0(oup)) 1710 polyMul 1711 LEAQ 16(oup), oup 1712 1713 sealSSETail64LoopB: 1714 chachaQR(A1, B1, C1, D1, T1) 1715 shiftB1Left; shiftC1Left; shiftD1Left 1716 chachaQR(A1, B1, C1, D1, T1) 1717 shiftB1Right; shiftC1Right; shiftD1Right 1718 polyAdd(0(oup)) 1719 polyMul 1720 LEAQ 16(oup), oup 1721 1722 DECQ itr1 1723 JG sealSSETail64LoopA 1724 1725 DECQ itr2 1726 JGE sealSSETail64LoopB 1727 PADDL chacha20Constants<>(SB), A1 1728 PADDL state1Store, B1 1729 PADDL state2Store, C1 1730 PADDL ctr0Store, D1 1731 1732 JMP sealSSE128Seal 1733 1734 // ---------------------------------------------------------------------------- 1735 // Special optimization for the last 128 bytes of plaintext 1736 sealSSETail128: 1737 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes 1738 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1739 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1740 1741 sealSSETail128LoopA: 1742 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1743 polyAdd(0(oup)) 1744 polyMul 1745 LEAQ 16(oup), oup 1746 1747 sealSSETail128LoopB: 1748 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1749 shiftB0Left; shiftC0Left; shiftD0Left 1750 shiftB1Left; shiftC1Left; shiftD1Left 1751 polyAdd(0(oup)) 1752 polyMul 1753 LEAQ 16(oup), oup 1754 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1755 shiftB0Right; shiftC0Right; shiftD0Right 1756 shiftB1Right; shiftC1Right; shiftD1Right 1757 1758 DECQ itr1 1759 JG sealSSETail128LoopA 1760 1761 DECQ itr2 1762 JGE sealSSETail128LoopB 1763 1764 
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 1765 PADDL state1Store, B0; PADDL state1Store, B1 1766 PADDL state2Store, C0; PADDL state2Store, C1 1767 PADDL ctr0Store, D0; PADDL ctr1Store, D1 1768 1769 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1770 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1771 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1772 1773 MOVQ $64, itr1 1774 LEAQ 64(inp), inp 1775 SUBQ $64, inl 1776 1777 JMP sealSSE128SealHash 1778 1779 // ---------------------------------------------------------------------------- 1780 // Special optimization for the last 192 bytes of plaintext 1781 sealSSETail192: 1782 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes 1783 MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1784 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1785 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store 1786 1787 sealSSETail192LoopA: 1788 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1789 polyAdd(0(oup)) 1790 polyMul 1791 LEAQ 16(oup), oup 1792 1793 sealSSETail192LoopB: 1794 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1795 shiftB0Left; shiftC0Left; shiftD0Left 1796 shiftB1Left; shiftC1Left; shiftD1Left 1797 shiftB2Left; shiftC2Left; shiftD2Left 1798 1799 polyAdd(0(oup)) 1800 polyMul 1801 LEAQ 16(oup), oup 1802 1803 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1804 shiftB0Right; shiftC0Right; shiftD0Right 1805 shiftB1Right; shiftC1Right; shiftD1Right 1806 shiftB2Right; shiftC2Right; shiftD2Right 1807 1808 DECQ itr1 1809 JG sealSSETail192LoopA 1810 1811 DECQ itr2 1812 JGE 
sealSSETail192LoopB 1813 1814 PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 1815 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 1816 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 1817 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 1818 1819 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1820 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1821 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1822 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 1823 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 1824 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1825 1826 MOVO A2, A1 1827 MOVO B2, B1 1828 MOVO C2, C1 1829 MOVO D2, D1 1830 MOVQ $128, itr1 1831 LEAQ 128(inp), inp 1832 SUBQ $128, inl 1833 1834 JMP sealSSE128SealHash 1835 1836 // ---------------------------------------------------------------------------- 1837 // Special seal optimization for buffers smaller than 129 bytes 1838 sealSSE128: 1839 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 1840 MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 1841 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 1842 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 1843 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 1844 MOVQ $10, itr2 1845 1846 sealSSE128InnerCipherLoop: 1847 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1848 shiftB0Left; shiftB1Left; shiftB2Left 1849 shiftC0Left; shiftC1Left; shiftC2Left 1850 shiftD0Left; shiftD1Left; shiftD2Left 1851 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 
1852 shiftB0Right; shiftB1Right; shiftB2Right 1853 shiftC0Right; shiftC1Right; shiftC2Right 1854 shiftD0Right; shiftD1Right; shiftD2Right 1855 DECQ itr2 1856 JNE sealSSE128InnerCipherLoop 1857 1858 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 1859 PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 1860 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 1861 PADDL T2, C1; PADDL T2, C2 1862 PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 1863 PAND polyClampMask<>(SB), A0 1864 MOVOU A0, rStore 1865 MOVOU B0, sStore 1866 1867 // Hash 1868 MOVQ ad_len+80(FP), itr2 1869 CALL polyHashADInternal<>(SB) 1870 XORQ itr1, itr1 1871 1872 sealSSE128SealHash: 1873 // itr1 holds the number of bytes encrypted but not yet hashed 1874 CMPQ itr1, $16 1875 JB sealSSE128Seal 1876 polyAdd(0(oup)) 1877 polyMul 1878 1879 SUBQ $16, itr1 1880 ADDQ $16, oup 1881 1882 JMP sealSSE128SealHash 1883 1884 sealSSE128Seal: 1885 CMPQ inl, $16 1886 JB sealSSETail 1887 SUBQ $16, inl 1888 1889 // Load for decryption 1890 MOVOU (inp), T0 1891 PXOR T0, A1 1892 MOVOU A1, (oup) 1893 LEAQ (1*16)(inp), inp 1894 LEAQ (1*16)(oup), oup 1895 1896 // Extract for hashing 1897 MOVQ A1, t0 1898 PSRLDQ $8, A1 1899 MOVQ A1, t1 1900 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1901 polyMul 1902 1903 // Shift the stream "left" 1904 MOVO B1, A1 1905 MOVO C1, B1 1906 MOVO D1, C1 1907 MOVO A2, D1 1908 MOVO B2, A2 1909 MOVO C2, B2 1910 MOVO D2, C2 1911 JMP sealSSE128Seal 1912 1913 sealSSETail: 1914 TESTQ inl, inl 1915 JE sealSSEFinalize 1916 1917 // We can only load the PT one byte at a time to avoid read after end of buffer 1918 MOVQ inl, itr2 1919 SHLQ $4, itr2 1920 LEAQ andMask<>(SB), t0 1921 MOVQ inl, itr1 1922 LEAQ -1(inp)(inl*1), inp 1923 XORQ t2, t2 1924 XORQ t3, t3 1925 XORQ AX, AX 1926 1927 sealSSETailLoadLoop: 1928 SHLQ $8, t2, t3 1929 SHLQ $8, t2 1930 MOVB (inp), AX 1931 XORQ AX, t2 1932 LEAQ -1(inp), inp 1933 DECQ itr1 1934 JNE sealSSETailLoadLoop 
1935 MOVQ t2, 0+tmpStore 1936 MOVQ t3, 8+tmpStore 1937 PXOR 0+tmpStore, A1 1938 MOVOU A1, (oup) 1939 MOVOU -16(t0)(itr2*1), T0 1940 PAND T0, A1 1941 MOVQ A1, t0 1942 PSRLDQ $8, A1 1943 MOVQ A1, t1 1944 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1945 polyMul 1946 1947 ADDQ inl, oup 1948 1949 sealSSEFinalize: 1950 // Hash in the buffer lengths 1951 ADDQ ad_len+80(FP), acc0 1952 ADCQ src_len+56(FP), acc1 1953 ADCQ $1, acc2 1954 polyMul 1955 1956 // Final reduce 1957 MOVQ acc0, t0 1958 MOVQ acc1, t1 1959 MOVQ acc2, t2 1960 SUBQ $-5, acc0 1961 SBBQ $-1, acc1 1962 SBBQ $3, acc2 1963 CMOVQCS t0, acc0 1964 CMOVQCS t1, acc1 1965 CMOVQCS t2, acc2 1966 1967 // Add in the "s" part of the key 1968 ADDQ 0+sStore, acc0 1969 ADCQ 8+sStore, acc1 1970 1971 // Finally store the tag at the end of the message 1972 MOVQ acc0, (0*8)(oup) 1973 MOVQ acc1, (1*8)(oup) 1974 RET 1975 1976 // ---------------------------------------------------------------------------- 1977 // ------------------------- AVX2 Code ---------------------------------------- 1978 chacha20Poly1305Seal_AVX2: 1979 VZEROUPPER 1980 VMOVDQU chacha20Constants<>(SB), AA0 1981 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 1982 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 1983 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 1984 VPADDD avx2InitMask<>(SB), DD0, DD0 1985 1986 // Special optimizations, for very short buffers 1987 CMPQ inl, $192 1988 JBE seal192AVX2 // 33% faster 1989 CMPQ inl, $320 1990 JBE seal320AVX2 // 17% faster 1991 1992 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 1993 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1994 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 1995 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 
1996 VPADDD avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 1997 VPADDD avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 1998 VPADDD avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 1999 VMOVDQA DD3, ctr3StoreAVX2 2000 MOVQ $10, itr2 2001 2002 sealAVX2IntroLoop: 2003 VMOVDQA CC3, tmpStoreAVX2 2004 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2005 VMOVDQA tmpStoreAVX2, CC3 2006 VMOVDQA CC1, tmpStoreAVX2 2007 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2008 VMOVDQA tmpStoreAVX2, CC1 2009 2010 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 2011 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 2012 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 2013 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 2014 2015 VMOVDQA CC3, tmpStoreAVX2 2016 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2017 VMOVDQA tmpStoreAVX2, CC3 2018 VMOVDQA CC1, tmpStoreAVX2 2019 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2020 VMOVDQA tmpStoreAVX2, CC1 2021 2022 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 2023 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2024 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2025 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2026 DECQ itr2 2027 JNE sealAVX2IntroLoop 2028 2029 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 2030 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2031 VPADDD state2StoreAVX2, CC0, CC0; 
VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2032 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2033 2034 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 2035 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key 2036 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 2037 2038 // Clamp and store poly key 2039 VPAND polyClampMask<>(SB), DD0, DD0 2040 VMOVDQA DD0, rsStoreAVX2 2041 2042 // Hash AD 2043 MOVQ ad_len+80(FP), itr2 2044 CALL polyHashADInternal<>(SB) 2045 2046 // Can store at least 320 bytes 2047 VPXOR (0*32)(inp), AA0, AA0 2048 VPXOR (1*32)(inp), CC0, CC0 2049 VMOVDQU AA0, (0*32)(oup) 2050 VMOVDQU CC0, (1*32)(oup) 2051 2052 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2053 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 2054 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) 2055 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2056 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 2057 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) 2058 2059 MOVQ $320, itr1 2060 SUBQ $320, inl 2061 LEAQ 320(inp), inp 2062 2063 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 2064 CMPQ inl, $128 2065 JBE sealAVX2SealHash 2066 2067 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 2068 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, 
(13*32)(oup) 2069 SUBQ $128, inl 2070 LEAQ 128(inp), inp 2071 2072 MOVQ $8, itr1 2073 MOVQ $2, itr2 2074 2075 CMPQ inl, $128 2076 JBE sealAVX2Tail128 2077 CMPQ inl, $256 2078 JBE sealAVX2Tail256 2079 CMPQ inl, $384 2080 JBE sealAVX2Tail384 2081 CMPQ inl, $512 2082 JBE sealAVX2Tail512 2083 2084 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop 2085 VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2086 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2087 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2088 VMOVDQA ctr3StoreAVX2, DD0 2089 VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 2090 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2091 2092 VMOVDQA CC3, tmpStoreAVX2 2093 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2094 VMOVDQA tmpStoreAVX2, CC3 2095 VMOVDQA CC1, tmpStoreAVX2 2096 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2097 VMOVDQA tmpStoreAVX2, CC1 2098 2099 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 2100 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 2101 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 2102 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 2103 2104 VMOVDQA CC3, tmpStoreAVX2 2105 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2106 VMOVDQA tmpStoreAVX2, CC3 2107 VMOVDQA CC1, tmpStoreAVX2 2108 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2109 VMOVDQA tmpStoreAVX2, CC1 2110 2111 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; 
VPALIGNR $4, DD0, DD0, DD0 2112 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2113 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2114 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2115 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2116 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2117 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 2118 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2119 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2120 VMOVDQA CC3, tmpStoreAVX2 2121 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2122 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2123 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2124 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2125 VMOVDQA tmpStoreAVX2, CC3 2126 2127 SUBQ $16, oup // Adjust the pointer 2128 MOVQ $9, itr1 2129 JMP sealAVX2InternalLoopStart 2130 2131 sealAVX2MainLoop: 2132 // Load state, increment counter blocks, store the incremented counters 2133 VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2134 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2135 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2136 VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 2137 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2138 MOVQ $10, itr1 2139 2140 sealAVX2InternalLoop: 2141 polyAdd(0*8(oup)) 2142 VPADDD BB0, AA0, AA0; VPADDD 
BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2143 polyMulStage1_AVX2 2144 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2145 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 2146 polyMulStage2_AVX2 2147 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2148 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2149 polyMulStage3_AVX2 2150 VMOVDQA CC3, tmpStoreAVX2 2151 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2152 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2153 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2154 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2155 VMOVDQA tmpStoreAVX2, CC3 2156 polyMulReduceStage 2157 2158 sealAVX2InternalLoopStart: 2159 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2160 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2161 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 2162 polyAdd(2*8(oup)) 2163 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2164 polyMulStage1_AVX2 2165 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2166 VMOVDQA CC3, tmpStoreAVX2 2167 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2168 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2169 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2170 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2171 VMOVDQA tmpStoreAVX2, CC3 2172 polyMulStage2_AVX2 2173 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 2174 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR 
$8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2175 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 2176 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2177 polyMulStage3_AVX2 2178 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2179 VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 2180 polyMulReduceStage 2181 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2182 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2183 polyAdd(4*8(oup)) 2184 LEAQ (6*8)(oup), oup 2185 VMOVDQA CC3, tmpStoreAVX2 2186 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2187 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2188 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2189 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2190 VMOVDQA tmpStoreAVX2, CC3 2191 polyMulStage1_AVX2 2192 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2193 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2194 polyMulStage2_AVX2 2195 VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 2196 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2197 polyMulStage3_AVX2 2198 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2199 VMOVDQA CC3, tmpStoreAVX2 2200 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2201 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2202 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2203 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2204 VMOVDQA tmpStoreAVX2, CC3 2205 
polyMulReduceStage 2206 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 2207 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2208 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 2209 DECQ itr1 2210 JNE sealAVX2InternalLoop 2211 2212 VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 2213 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2214 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2215 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2216 VMOVDQA CC3, tmpStoreAVX2 2217 2218 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 2219 polyAdd(0*8(oup)) 2220 polyMulAVX2 2221 LEAQ (4*8)(oup), oup 2222 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 2223 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 2224 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 2225 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2226 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 2227 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 2228 2229 // and here 2230 polyAdd(-2*8(oup)) 2231 polyMulAVX2 2232 VPERM2I128 
$0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2233 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 2234 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 2235 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 2236 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 2237 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 2238 LEAQ (32*16)(inp), inp 2239 SUBQ $(32*16), inl 2240 CMPQ inl, $512 2241 JG sealAVX2MainLoop 2242 2243 // Tail can only hash 480 bytes 2244 polyAdd(0*8(oup)) 2245 polyMulAVX2 2246 polyAdd(2*8(oup)) 2247 polyMulAVX2 2248 LEAQ 32(oup), oup 2249 2250 MOVQ $10, itr1 2251 MOVQ $0, itr2 2252 CMPQ inl, $128 2253 JBE sealAVX2Tail128 2254 CMPQ inl, $256 2255 JBE sealAVX2Tail256 2256 CMPQ inl, $384 2257 JBE sealAVX2Tail384 2258 JMP sealAVX2Tail512 2259 2260 // ---------------------------------------------------------------------------- 2261 // Special optimization for buffers smaller than 193 bytes 2262 seal192AVX2: 2263 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 2264 VMOVDQA AA0, AA1 2265 VMOVDQA BB0, BB1 2266 VMOVDQA CC0, CC1 2267 VPADDD avx2IncMask<>(SB), DD0, DD1 2268 VMOVDQA AA0, AA2 2269 VMOVDQA BB0, BB2 2270 VMOVDQA CC0, CC2 2271 VMOVDQA DD0, DD2 2272 VMOVDQA DD1, TT3 2273 MOVQ $10, itr2 2274 2275 sealAVX2192InnerCipherLoop: 2276 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2277 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 2278 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2279 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 
2280 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2281 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 2282 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2283 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 2284 DECQ itr2 2285 JNE sealAVX2192InnerCipherLoop 2286 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 2287 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 2288 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 2289 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 2290 VPERM2I128 $0x02, AA0, BB0, TT0 2291 2292 // Clamp and store poly key 2293 VPAND polyClampMask<>(SB), TT0, TT0 2294 VMOVDQA TT0, rsStoreAVX2 2295 2296 // Stream for up to 192 bytes 2297 VPERM2I128 $0x13, AA0, BB0, AA0 2298 VPERM2I128 $0x13, CC0, DD0, BB0 2299 VPERM2I128 $0x02, AA1, BB1, CC0 2300 VPERM2I128 $0x02, CC1, DD1, DD0 2301 VPERM2I128 $0x13, AA1, BB1, AA1 2302 VPERM2I128 $0x13, CC1, DD1, BB1 2303 2304 sealAVX2ShortSeal: 2305 // Hash aad 2306 MOVQ ad_len+80(FP), itr2 2307 CALL polyHashADInternal<>(SB) 2308 XORQ itr1, itr1 2309 2310 sealAVX2SealHash: 2311 // itr1 holds the number of bytes encrypted but not yet hashed 2312 CMPQ itr1, $16 2313 JB sealAVX2ShortSealLoop 2314 polyAdd(0(oup)) 2315 polyMul 2316 SUBQ $16, itr1 2317 ADDQ $16, oup 2318 JMP sealAVX2SealHash 2319 2320 sealAVX2ShortSealLoop: 2321 CMPQ inl, $32 2322 JB sealAVX2ShortTail32 2323 SUBQ $32, inl 2324 2325 // Load for encryption 2326 VPXOR (inp), AA0, AA0 2327 VMOVDQU AA0, (oup) 2328 LEAQ (1*32)(inp), inp 2329 2330 // Now can hash 2331 polyAdd(0*8(oup)) 2332 polyMulAVX2 2333 polyAdd(2*8(oup)) 2334 polyMulAVX2 2335 LEAQ (1*32)(oup), oup 2336 2337 // Shift stream left 2338 VMOVDQA BB0, AA0 2339 VMOVDQA CC0, BB0 2340 VMOVDQA DD0, CC0 2341 VMOVDQA AA1, DD0 2342 VMOVDQA BB1, AA1 2343 VMOVDQA CC1, BB1 2344 VMOVDQA DD1, CC1 2345 VMOVDQA AA2, DD1 2346 VMOVDQA BB2, AA2 2347 JMP sealAVX2ShortSealLoop 2348 2349 sealAVX2ShortTail32: 2350 CMPQ inl, $16 2351 VMOVDQA A0, A1 2352 JB sealAVX2ShortDone 2353 
2354 SUBQ $16, inl 2355 2356 // Load for encryption 2357 VPXOR (inp), A0, T0 2358 VMOVDQU T0, (oup) 2359 LEAQ (1*16)(inp), inp 2360 2361 // Hash 2362 polyAdd(0*8(oup)) 2363 polyMulAVX2 2364 LEAQ (1*16)(oup), oup 2365 VPERM2I128 $0x11, AA0, AA0, AA0 2366 VMOVDQA A0, A1 2367 2368 sealAVX2ShortDone: 2369 VZEROUPPER 2370 JMP sealSSETail 2371 2372 // ---------------------------------------------------------------------------- 2373 // Special optimization for buffers smaller than 321 bytes 2374 seal320AVX2: 2375 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 2376 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 2377 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 2378 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 2379 MOVQ $10, itr2 2380 2381 sealAVX2320InnerCipherLoop: 2382 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2383 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 2384 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2385 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 2386 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2387 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 2388 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2389 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 2390 DECQ itr2 2391 JNE sealAVX2320InnerCipherLoop 2392 2393 VMOVDQA chacha20Constants<>(SB), TT0 2394 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 2395 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 2396 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 
// Finish the seal320AVX2 feed-forward: add back each state set's (incremented)
// initial counter row; TT3 walks through the three counter values.
	VMOVDQA avx2IncMask<>(SB), TT0
	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
	VPADDD  TT3, DD2, DD2

	// Clamp and store poly key
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPAND      polyClampMask<>(SB), TT0, TT0
	VMOVDQA    TT0, rsStoreAVX2

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1
	VPERM2I128 $0x02, AA2, BB2, CC1
	VPERM2I128 $0x02, CC2, DD2, DD1
	VPERM2I128 $0x13, AA2, BB2, AA2
	VPERM2I128 $0x13, CC2, DD2, BB2
	JMP        sealAVX2ShortSeal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of ciphertext
sealAVX2Tail128:
	// Need to encrypt up to 128 bytes - prepare two blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA chacha20Constants<>(SB), AA0
	VMOVDQA state1StoreAVX2, BB0
	VMOVDQA state2StoreAVX2, CC0
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  avx2IncMask<>(SB), DD0, DD0
	VMOVDQA DD0, DD1 // keep the counter row for the final feed-forward add

	// LoopA hashes one extra 16-byte chunk of already-written ciphertext per
	// iteration (itr1 times); LoopB interleaves hashing with the ChaCha rounds
	// (itr2 times).
sealAVX2Tail128LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail128LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $12, DD0, DD0, DD0
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $4, DD0, DD0, DD0
	DECQ     itr1
	JG       sealAVX2Tail128LoopA
	DECQ     itr2
	JGE      sealAVX2Tail128LoopB

	// Feed-forward, then interleave the two 64-byte blocks into sequential order.
	VPADDD chacha20Constants<>(SB), AA0, AA1
	VPADDD state1StoreAVX2, BB0, BB1
	VPADDD state2StoreAVX2, CC0, CC1
	VPADDD DD1, DD0, DD1

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	JMP        sealAVX2ShortSealLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of ciphertext
sealAVX2Tail256:
	// Need to encrypt up to 256 bytes - prepare four blocks (two YMM state sets)
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  avx2IncMask<>(SB), DD0, DD0
	VPADDD  avx2IncMask<>(SB), DD0, DD1
	VMOVDQA DD0, TT1 // save both counter rows for the final feed-forward add
	VMOVDQA DD1, TT2

sealAVX2Tail256LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail256LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ     itr1
	JG       sealAVX2Tail256LoopA
	DECQ     itr2
	JGE      sealAVX2Tail256LoopB

	// Feed-forward, emit the first 128 bytes, and leave the last 128 bytes of
	// keystream in AA0/BB0/CC0/DD0 for sealAVX2SealHash.
	VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR   (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	MOVQ $128, itr1 // the 128 bytes just written still need hashing
	LEAQ 128(inp), inp
	SUBQ $128, inl
	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0

	JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of ciphertext
sealAVX2Tail384:
	// Need to encrypt up to 384 bytes - prepare six blocks (three YMM state sets)
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2
	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 // save counter rows for the final feed-forward add

sealAVX2Tail384LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail384LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(0(oup))
	polyMul
// sealAVX2Tail384 inner loop, continued: diagonalize, run the second
// quarter-round pass, then rotate back to column order.
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ     itr1
	JG       sealAVX2Tail384LoopA
	DECQ     itr2
	JGE      sealAVX2Tail384LoopB

	// Feed-forward the saved initial state.
	VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2

	// Emit the first 256 bytes; the last 128 bytes of keystream stay in
	// AA0/BB0/CC0/DD0 for sealAVX2SealHash.
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR   (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, TT0
	VPERM2I128 $0x02, CC1, DD1, TT1
	VPERM2I128 $0x13, AA1, BB1, TT2
	VPERM2I128 $0x13, CC1, DD1, TT3
	VPXOR   (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
	VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
	MOVQ $256, itr1 // the 256 bytes just written still need hashing
	LEAQ 256(inp), inp
	SUBQ $256, inl
	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0

	JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of ciphertext
sealAVX2Tail512:
	// Need to encrypt up to 512 bytes - prepare eight blocks (four YMM state sets)
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

sealAVX2Tail512LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail512LoopB:
	// All sixteen YMM registers hold ChaCha state here, so CC3 is spilled to
	// tmpStoreAVX2 whenever a scratch register is needed for the rotates.
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
// sealAVX2Tail512 inner loop, continued: finish the rotate-left-by-12 for the
// remaining B rows, then the rest of the double round, with Poly1305 work on
// already-written ciphertext interleaved between the vector instructions.
	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3 // restore the spilled CC3 row
	polyAdd(0*8(oup))
	polyMulAVX2
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	// Diagonalize for the second half of the double round.
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ    (4*8)(oup), oup // 32 bytes of ciphertext hashed per LoopB iteration
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	// Un-diagonalize back to column order.
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3

	DECQ itr1
	JG   sealAVX2Tail512LoopA
	DECQ itr2
	JGE  sealAVX2Tail512LoopB

	// Feed-forward the saved initial state rows.
	VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
// sealAVX2Tail512 finale: add back the saved counter rows, then XOR out the
// first 384 bytes of ciphertext; the last 128 bytes of keystream are left in
// AA0/BB0/CC0/DD0 for sealAVX2SealHash.
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2 // free a register to use as the shuffle destination
	VPERM2I128 $0x02, AA0, BB0, CC3
	VPXOR   (0*32)(inp), CC3, CC3
	VMOVDQU CC3, (0*32)(oup)
	VPERM2I128 $0x02, CC0, DD0, CC3
	VPXOR   (1*32)(inp), CC3, CC3
	VMOVDQU CC3, (1*32)(oup)
	VPERM2I128 $0x13, AA0, BB0, CC3
	VPXOR   (2*32)(inp), CC3, CC3
	VMOVDQU CC3, (2*32)(oup)
	VPERM2I128 $0x13, CC0, DD0, CC3
	VPXOR   (3*32)(inp), CC3, CC3
	VMOVDQU CC3, (3*32)(oup)

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR   (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR   (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)

	MOVQ $384, itr1 // the 384 bytes just written still need hashing
	LEAQ 384(inp), inp
	SUBQ $384, inl
	VPERM2I128 $0x02, AA3, BB3, AA0
	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 // tmpStoreAVX2 holds the fed-forward CC3
	VPERM2I128 $0x13, AA3, BB3, CC0
	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0

	JMP sealAVX2SealHash

// func haveSSSE3() bool
// Reports whether the CPU supports SSSE3: CPUID with EAX=1 returns feature
// flags in ECX, where bit 9 is the SSSE3 flag (Intel SDM, CPUID leaf 01H).
TEXT haveSSSE3(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX // EAX = 1: processor info and feature bits
	CPUID
	SHRQ $9, CX // move ECX bit 9 (SSSE3) to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET