// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
// The implementation uses some optimizations as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

     12 #include "textflag.h"
     13 
     14 #define B0 X0
     15 #define B1 X1
     16 #define B2 X2
     17 #define B3 X3
     18 #define B4 X4
     19 #define B5 X5
     20 #define B6 X6
     21 #define B7 X7
     22 
     23 #define ACC0 X8
     24 #define ACC1 X9
     25 #define ACCM X10
     26 
     27 #define T0 X11
     28 #define T1 X12
     29 #define T2 X13
     30 #define POLY X14
     31 #define BSWAP X15
     32 
     33 DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
     34 DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
     35 
     36 DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
     37 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
     38 
     39 DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
     40 DATA andMask<>+0x08(SB)/8, $0x0000000000000000
     41 DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
     42 DATA andMask<>+0x18(SB)/8, $0x0000000000000000
     43 DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
     44 DATA andMask<>+0x28(SB)/8, $0x0000000000000000
     45 DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
     46 DATA andMask<>+0x38(SB)/8, $0x0000000000000000
     47 DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
     48 DATA andMask<>+0x48(SB)/8, $0x0000000000000000
     49 DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
     50 DATA andMask<>+0x58(SB)/8, $0x0000000000000000
     51 DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
     52 DATA andMask<>+0x68(SB)/8, $0x0000000000000000
     53 DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
     54 DATA andMask<>+0x78(SB)/8, $0x0000000000000000
     55 DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
     56 DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
     57 DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
     58 DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
     59 DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
     60 DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
     61 DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
     62 DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
     63 DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
     64 DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
     65 DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
     66 DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
     67 DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
     68 DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
     69 
     70 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
     71 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
     72 GLOBL andMask<>(SB), (NOPTR+RODATA), $240
     73 
     74 // func hasGCMAsm() bool
     75 // returns whether AES-NI AND CLMUL-NI are supported
     76 TEXT hasGCMAsm(SB),NOSPLIT,$0
     77 	XORQ AX, AX
     78 	INCL AX
     79 	CPUID
     80 	MOVQ CX, DX
     81 	SHRQ $25, CX
     82 	SHRQ $1, DX
     83 	ANDQ DX, CX
     84 	ANDQ $1, CX
     85 	MOVB CX, ret+0(FP)
     86 	RET
     87 
     88 // func aesEncBlock(dst, src *[16]byte, ks []uint32)
     89 TEXT aesEncBlock(SB),NOSPLIT,$0
     90 	MOVQ dst+0(FP), DI
     91 	MOVQ src+8(FP), SI
     92 	MOVQ ks_base+16(FP), DX
     93 	MOVQ ks_len+24(FP), CX
     94 
     95 	SHRQ $2, CX
     96 	DECQ CX
     97 
     98 	MOVOU (SI), X0
     99 	MOVOU (16*0)(DX), X1
    100 	PXOR X1, X0
    101 	MOVOU (16*1)(DX), X1
    102 	AESENC X1, X0
    103 	MOVOU (16*2)(DX), X1
    104 	AESENC X1, X0
    105 	MOVOU (16*3)(DX), X1
    106 	AESENC X1, X0
    107 	MOVOU (16*4)(DX), X1
    108 	AESENC X1, X0
    109 	MOVOU (16*5)(DX), X1
    110 	AESENC X1, X0
    111 	MOVOU (16*6)(DX), X1
    112 	AESENC X1, X0
    113 	MOVOU (16*7)(DX), X1
    114 	AESENC X1, X0
    115 	MOVOU (16*8)(DX), X1
    116 	AESENC X1, X0
    117 	MOVOU (16*9)(DX), X1
    118 	AESENC X1, X0
    119 	MOVOU (16*10)(DX), X1
    120 	CMPQ CX, $12
    121 	JB encLast
    122 	AESENC X1, X0
    123 	MOVOU (16*11)(DX), X1
    124 	AESENC X1, X0
    125 	MOVOU (16*12)(DX), X1
    126 	JE encLast
    127 	AESENC X1, X0
    128 	MOVOU (16*13)(DX), X1
    129 	AESENC X1, X0
    130 	MOVOU (16*14)(DX), X1
    131 
    132 encLast:
    133 	AESENCLAST X1, X0
    134 	MOVOU X0, (DI)
    135 
    136 	RET
    137 
    138 // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
    139 TEXT gcmAesFinish(SB),NOSPLIT,$0
    140 #define pTbl DI
    141 #define tMsk SI
    142 #define tPtr DX
    143 #define plen AX
    144 #define dlen CX
    145 
    146 	MOVQ productTable+0(FP), pTbl
    147 	MOVQ tagMask+8(FP), tMsk
    148 	MOVQ T+16(FP), tPtr
    149 	MOVQ pLen+24(FP), plen
    150 	MOVQ dLen+32(FP), dlen
    151 
    152 	MOVOU (tPtr), ACC0
    153 	MOVOU (tMsk), T2
    154 
    155 	MOVOU bswapMask<>(SB), BSWAP
    156 	MOVOU gcmPoly<>(SB), POLY
    157 
    158 	SHLQ $3, plen
    159 	SHLQ $3, dlen
    160 
    161 	MOVQ plen, B0
    162 	PINSRQ $1, dlen, B0
    163 
    164 	PXOR ACC0, B0
    165 
    166 	MOVOU (16*14)(pTbl), ACC0
    167 	MOVOU (16*15)(pTbl), ACCM
    168 	MOVOU ACC0, ACC1
    169 
    170 	PCLMULQDQ $0x00, B0, ACC0
    171 	PCLMULQDQ $0x11, B0, ACC1
    172 	PSHUFD $78, B0, T0
    173 	PXOR B0, T0
    174 	PCLMULQDQ $0x00, T0, ACCM
    175 
    176 	PXOR ACC0, ACCM
    177 	PXOR ACC1, ACCM
    178 	MOVOU ACCM, T0
    179 	PSRLDQ $8, ACCM
    180 	PSLLDQ $8, T0
    181 	PXOR ACCM, ACC1
    182 	PXOR T0, ACC0
    183 
    184 	MOVOU POLY, T0
    185 	PCLMULQDQ $0x01, ACC0, T0
    186 	PSHUFD $78, ACC0, ACC0
    187 	PXOR T0, ACC0
    188 
    189 	MOVOU POLY, T0
    190 	PCLMULQDQ $0x01, ACC0, T0
    191 	PSHUFD $78, ACC0, ACC0
    192 	PXOR T0, ACC0
    193 
    194 	PXOR ACC1, ACC0
    195 
    196 	PSHUFB BSWAP, ACC0
    197 	PXOR T2, ACC0
    198 	MOVOU ACC0, (tPtr)
    199 
    200 	RET
    201 #undef pTbl
    202 #undef tMsk
    203 #undef tPtr
    204 #undef plen
    205 #undef dlen
    206 
    207 // func gcmAesInit(productTable *[256]byte, ks []uint32)
    208 TEXT gcmAesInit(SB),NOSPLIT,$0
    209 #define dst DI
    210 #define KS SI
    211 #define NR DX
    212 
    213 	MOVQ productTable+0(FP), dst
    214 	MOVQ ks_base+8(FP), KS
    215 	MOVQ ks_len+16(FP), NR
    216 
    217 	SHRQ $2, NR
    218 	DECQ NR
    219 
    220 	MOVOU bswapMask<>(SB), BSWAP
    221 	MOVOU gcmPoly<>(SB), POLY
    222 
    223 	// Encrypt block 0, with the AES key to generate the hash key H
    224 	MOVOU (16*0)(KS), B0
    225 	MOVOU (16*1)(KS), T0
    226 	AESENC T0, B0
    227 	MOVOU (16*2)(KS), T0
    228 	AESENC T0, B0
    229 	MOVOU (16*3)(KS), T0
    230 	AESENC T0, B0
    231 	MOVOU (16*4)(KS), T0
    232 	AESENC T0, B0
    233 	MOVOU (16*5)(KS), T0
    234 	AESENC T0, B0
    235 	MOVOU (16*6)(KS), T0
    236 	AESENC T0, B0
    237 	MOVOU (16*7)(KS), T0
    238 	AESENC T0, B0
    239 	MOVOU (16*8)(KS), T0
    240 	AESENC T0, B0
    241 	MOVOU (16*9)(KS), T0
    242 	AESENC T0, B0
    243 	MOVOU (16*10)(KS), T0
    244 	CMPQ NR, $12
    245 	JB initEncLast
    246 	AESENC T0, B0
    247 	MOVOU (16*11)(KS), T0
    248 	AESENC T0, B0
    249 	MOVOU (16*12)(KS), T0
    250 	JE initEncLast
    251 	AESENC T0, B0
    252 	MOVOU (16*13)(KS), T0
    253 	AESENC T0, B0
    254 	MOVOU (16*14)(KS), T0
    255 initEncLast:
    256 	AESENCLAST T0, B0
    257 
    258 	PSHUFB BSWAP, B0
    259 	// H * 2
    260 	PSHUFD $0xff, B0, T0
    261 	MOVOU B0, T1
    262 	PSRAL $31, T0
    263 	PAND POLY, T0
    264 	PSRLL $31, T1
    265 	PSLLDQ $4, T1
    266 	PSLLL $1, B0
    267 	PXOR T0, B0
    268 	PXOR T1, B0
    269 	// Karatsuba pre-computations
    270 	MOVOU B0, (16*14)(dst)
    271 	PSHUFD $78, B0, B1
    272 	PXOR B0, B1
    273 	MOVOU B1, (16*15)(dst)
    274 
    275 	MOVOU B0, B2
    276 	MOVOU B1, B3
    277 	// Now prepare powers of H and pre-computations for them
    278 	MOVQ $7, AX
    279 
    280 initLoop:
    281 		MOVOU B2, T0
    282 		MOVOU B2, T1
    283 		MOVOU B3, T2
    284 		PCLMULQDQ $0x00, B0, T0
    285 		PCLMULQDQ $0x11, B0, T1
    286 		PCLMULQDQ $0x00, B1, T2
    287 
    288 		PXOR T0, T2
    289 		PXOR T1, T2
    290 		MOVOU T2, B4
    291 		PSLLDQ $8, B4
    292 		PSRLDQ $8, T2
    293 		PXOR B4, T0
    294 		PXOR T2, T1
    295 
    296 		MOVOU POLY, B2
    297 		PCLMULQDQ $0x01, T0, B2
    298 		PSHUFD $78, T0, T0
    299 		PXOR B2, T0
    300 		MOVOU POLY, B2
    301 		PCLMULQDQ $0x01, T0, B2
    302 		PSHUFD $78, T0, T0
    303 		PXOR T0, B2
    304 		PXOR T1, B2
    305 
    306 		MOVOU B2, (16*12)(dst)
    307 		PSHUFD $78, B2, B3
    308 		PXOR B2, B3
    309 		MOVOU B3, (16*13)(dst)
    310 
    311 		DECQ AX
    312 		LEAQ (-16*2)(dst), dst
    313 	JNE initLoop
    314 
    315 	RET
    316 #undef NR
    317 #undef KS
    318 #undef dst
    319 
    320 // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
    321 TEXT gcmAesData(SB),NOSPLIT,$0
    322 #define pTbl DI
    323 #define aut SI
    324 #define tPtr CX
    325 #define autLen DX
    326 
    327 	MOVQ productTable+0(FP), pTbl
    328 	MOVQ data_base+8(FP), aut
    329 	MOVQ data_len+16(FP), autLen
    330 	MOVQ T+32(FP), tPtr
    331 
    332 	PXOR ACC0, ACC0
    333 	MOVOU bswapMask<>(SB), BSWAP
    334 	MOVOU gcmPoly<>(SB), POLY
    335 
    336 	MOVOU (16*14)(pTbl), T1
    337 	MOVOU (16*15)(pTbl), T2
    338 
    339 	TESTQ autLen, autLen
    340 	JEQ dataBail
    341 
    342 	CMPQ autLen, $13	// optimize the TLS case
    343 	JNE dataSinglesLoop
    344 
    345 	PXOR B0, B0
    346 	MOVQ (aut), B0
    347 	PINSRD $2, 8(aut), B0
    348 	PINSRB $12, 12(aut), B0
    349 	XORQ autLen, autLen
    350 	JMP dataMul
    351 
    352 dataSinglesLoop:
    353 
    354 		CMPQ autLen, $16
    355 		JB dataEnd
    356 		SUBQ $16, autLen
    357 
    358 		MOVOU (aut), B0
    359 dataMul:
    360 		PSHUFB BSWAP, B0
    361 		PXOR ACC0, B0
    362 
    363 		MOVOU T1, ACC0
    364 		MOVOU T2, ACCM
    365 		MOVOU T1, ACC1
    366 
    367 		PSHUFD $78, B0, T0
    368 		PXOR B0, T0
    369 		PCLMULQDQ $0x00, B0, ACC0
    370 		PCLMULQDQ $0x11, B0, ACC1
    371 		PCLMULQDQ $0x00, T0, ACCM
    372 
    373 		PXOR ACC0, ACCM
    374 		PXOR ACC1, ACCM
    375 		MOVOU ACCM, T0
    376 		PSRLDQ $8, ACCM
    377 		PSLLDQ $8, T0
    378 		PXOR ACCM, ACC1
    379 		PXOR T0, ACC0
    380 
    381 		MOVOU POLY, T0
    382 		PCLMULQDQ $0x01, ACC0, T0
    383 		PSHUFD $78, ACC0, ACC0
    384 		PXOR T0, ACC0
    385 
    386 		MOVOU POLY, T0
    387 		PCLMULQDQ $0x01, ACC0, T0
    388 		PSHUFD $78, ACC0, ACC0
    389 		PXOR T0, ACC0
    390 		PXOR ACC1, ACC0
    391 
    392 		LEAQ 16(aut), aut
    393 
    394 	JMP dataSinglesLoop
    395 
    396 dataEnd:
    397 
    398 	TESTQ autLen, autLen
    399 	JEQ dataBail
    400 
    401 	PXOR B0, B0
    402 	LEAQ -1(aut)(autLen*1), aut
    403 
    404 dataLoadLoop:
    405 
    406 		PSLLDQ $1, B0
    407 		PINSRB $0, (aut), B0
    408 
    409 		LEAQ -1(aut), aut
    410 		DECQ autLen
    411 		JNE dataLoadLoop
    412 
    413 	JMP dataMul
    414 
    415 dataBail:
    416 	MOVOU ACC0, (tPtr)
    417 	RET
    418 #undef pTbl
    419 #undef aut
    420 #undef tPtr
    421 #undef autLen
    422 
    423 // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
    424 TEXT gcmAesEnc(SB),0,$256-96
    425 #define pTbl DI
    426 #define ctx DX
    427 #define ctrPtr CX
    428 #define ptx SI
    429 #define ks AX
    430 #define tPtr R8
    431 #define ptxLen R9
    432 #define aluCTR R10
    433 #define aluTMP R11
    434 #define aluK R12
    435 #define NR R13
    436 
    437 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
    438 #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
    439 #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
    440 #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
    441 #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
    442 #define combinedRound(i) \
    443 	MOVOU (16*i)(ks), T0;\
    444 	AESENC T0, B0;\
    445 	AESENC T0, B1;\
    446 	AESENC T0, B2;\
    447 	AESENC T0, B3;\
    448 	 MOVOU (16*(i*2))(pTbl), T1;\
    449 	 MOVOU T1, T2;\
    450 	AESENC T0, B4;\
    451 	AESENC T0, B5;\
    452 	AESENC T0, B6;\
    453 	AESENC T0, B7;\
    454 	 MOVOU (16*i)(SP), T0;\
    455 	 PCLMULQDQ $0x00, T0, T1;\
    456 	 PXOR T1, ACC0;\
    457 	 PSHUFD $78, T0, T1;\
    458 	 PCLMULQDQ $0x11, T0, T2;\
    459 	 PXOR T1, T0;\
    460 	 PXOR T2, ACC1;\
    461 	 MOVOU (16*(i*2+1))(pTbl), T2;\
    462 	 PCLMULQDQ $0x00, T2, T0;\
    463 	 PXOR T0, ACCM
    464 #define mulRound(i) \
    465 	MOVOU (16*i)(SP), T0;\
    466 	MOVOU (16*(i*2))(pTbl), T1;\
    467 	MOVOU T1, T2;\
    468 	PCLMULQDQ $0x00, T0, T1;\
    469 	PXOR T1, ACC0;\
    470 	PCLMULQDQ $0x11, T0, T2;\
    471 	PXOR T2, ACC1;\
    472 	PSHUFD $78, T0, T1;\
    473 	PXOR T1, T0;\
    474 	MOVOU (16*(i*2+1))(pTbl), T1;\
    475 	PCLMULQDQ $0x00, T0, T1;\
    476 	PXOR T1, ACCM
    477 
    478 	MOVQ productTable+0(FP), pTbl
    479 	MOVQ dst+8(FP), ctx
    480 	MOVQ src_base+32(FP), ptx
    481 	MOVQ src_len+40(FP), ptxLen
    482 	MOVQ ctr+56(FP), ctrPtr
    483 	MOVQ T+64(FP), tPtr
    484 	MOVQ ks_base+72(FP), ks
    485 	MOVQ ks_len+80(FP), NR
    486 
    487 	SHRQ $2, NR
    488 	DECQ NR
    489 
    490 	MOVOU bswapMask<>(SB), BSWAP
    491 	MOVOU gcmPoly<>(SB), POLY
    492 
    493 	MOVOU (tPtr), ACC0
    494 	PXOR ACC1, ACC1
    495 	PXOR ACCM, ACCM
    496 	MOVOU (ctrPtr), B0
    497 	MOVL (3*4)(ctrPtr), aluCTR
    498 	MOVOU (ks), T0
    499 	MOVL (3*4)(ks), aluK
    500 	BSWAPL aluCTR
    501 	BSWAPL aluK
    502 
    503 	PXOR B0, T0
    504 	MOVOU T0, (8*16 + 0*16)(SP)
    505 	increment(0)
    506 
    507 	CMPQ ptxLen, $128
    508 	JB gcmAesEncSingles
    509 	SUBQ $128, ptxLen
    510 
    511 	// We have at least 8 blocks to encrypt, prepare the rest of the counters
    512 	MOVOU T0, (8*16 + 1*16)(SP)
    513 	increment(1)
    514 	MOVOU T0, (8*16 + 2*16)(SP)
    515 	increment(2)
    516 	MOVOU T0, (8*16 + 3*16)(SP)
    517 	increment(3)
    518 	MOVOU T0, (8*16 + 4*16)(SP)
    519 	increment(4)
    520 	MOVOU T0, (8*16 + 5*16)(SP)
    521 	increment(5)
    522 	MOVOU T0, (8*16 + 6*16)(SP)
    523 	increment(6)
    524 	MOVOU T0, (8*16 + 7*16)(SP)
    525 	increment(7)
    526 
    527 	MOVOU (8*16 + 0*16)(SP), B0
    528 	MOVOU (8*16 + 1*16)(SP), B1
    529 	MOVOU (8*16 + 2*16)(SP), B2
    530 	MOVOU (8*16 + 3*16)(SP), B3
    531 	MOVOU (8*16 + 4*16)(SP), B4
    532 	MOVOU (8*16 + 5*16)(SP), B5
    533 	MOVOU (8*16 + 6*16)(SP), B6
    534 	MOVOU (8*16 + 7*16)(SP), B7
    535 
    536 	aesRound(1)
    537 	increment(0)
    538 	aesRound(2)
    539 	increment(1)
    540 	aesRound(3)
    541 	increment(2)
    542 	aesRound(4)
    543 	increment(3)
    544 	aesRound(5)
    545 	increment(4)
    546 	aesRound(6)
    547 	increment(5)
    548 	aesRound(7)
    549 	increment(6)
    550 	aesRound(8)
    551 	increment(7)
    552 	aesRound(9)
    553 	MOVOU (16*10)(ks), T0
    554 	CMPQ NR, $12
    555 	JB encLast1
    556 	aesRnd(T0)
    557 	aesRound(11)
    558 	MOVOU (16*12)(ks), T0
    559 	JE encLast1
    560 	aesRnd(T0)
    561 	aesRound(13)
    562 	MOVOU (16*14)(ks), T0
    563 encLast1:
    564 	aesRndLast(T0)
    565 
    566 	MOVOU (16*0)(ptx), T0
    567 	PXOR T0, B0
    568 	MOVOU (16*1)(ptx), T0
    569 	PXOR T0, B1
    570 	MOVOU (16*2)(ptx), T0
    571 	PXOR T0, B2
    572 	MOVOU (16*3)(ptx), T0
    573 	PXOR T0, B3
    574 	MOVOU (16*4)(ptx), T0
    575 	PXOR T0, B4
    576 	MOVOU (16*5)(ptx), T0
    577 	PXOR T0, B5
    578 	MOVOU (16*6)(ptx), T0
    579 	PXOR T0, B6
    580 	MOVOU (16*7)(ptx), T0
    581 	PXOR T0, B7
    582 
    583 	MOVOU B0, (16*0)(ctx)
    584 	PSHUFB BSWAP, B0
    585 	PXOR ACC0, B0
    586 	MOVOU B1, (16*1)(ctx)
    587 	PSHUFB BSWAP, B1
    588 	MOVOU B2, (16*2)(ctx)
    589 	PSHUFB BSWAP, B2
    590 	MOVOU B3, (16*3)(ctx)
    591 	PSHUFB BSWAP, B3
    592 	MOVOU B4, (16*4)(ctx)
    593 	PSHUFB BSWAP, B4
    594 	MOVOU B5, (16*5)(ctx)
    595 	PSHUFB BSWAP, B5
    596 	MOVOU B6, (16*6)(ctx)
    597 	PSHUFB BSWAP, B6
    598 	MOVOU B7, (16*7)(ctx)
    599 	PSHUFB BSWAP, B7
    600 
    601 	MOVOU B0, (16*0)(SP)
    602 	MOVOU B1, (16*1)(SP)
    603 	MOVOU B2, (16*2)(SP)
    604 	MOVOU B3, (16*3)(SP)
    605 	MOVOU B4, (16*4)(SP)
    606 	MOVOU B5, (16*5)(SP)
    607 	MOVOU B6, (16*6)(SP)
    608 	MOVOU B7, (16*7)(SP)
    609 
    610 	LEAQ 128(ptx), ptx
    611 	LEAQ 128(ctx), ctx
    612 
    613 gcmAesEncOctetsLoop:
    614 
    615 		CMPQ ptxLen, $128
    616 		JB gcmAesEncOctetsEnd
    617 		SUBQ $128, ptxLen
    618 
    619 		MOVOU (8*16 + 0*16)(SP), B0
    620 		MOVOU (8*16 + 1*16)(SP), B1
    621 		MOVOU (8*16 + 2*16)(SP), B2
    622 		MOVOU (8*16 + 3*16)(SP), B3
    623 		MOVOU (8*16 + 4*16)(SP), B4
    624 		MOVOU (8*16 + 5*16)(SP), B5
    625 		MOVOU (8*16 + 6*16)(SP), B6
    626 		MOVOU (8*16 + 7*16)(SP), B7
    627 
    628 		MOVOU (16*0)(SP), T0
    629 		PSHUFD $78, T0, T1
    630 		PXOR T0, T1
    631 
    632 		MOVOU (16*0)(pTbl), ACC0
    633 		MOVOU (16*1)(pTbl), ACCM
    634 		MOVOU ACC0, ACC1
    635 
    636 		PCLMULQDQ $0x00, T1, ACCM
    637 		PCLMULQDQ $0x00, T0, ACC0
    638 		PCLMULQDQ $0x11, T0, ACC1
    639 
    640 		combinedRound(1)
    641 		increment(0)
    642 		combinedRound(2)
    643 		increment(1)
    644 		combinedRound(3)
    645 		increment(2)
    646 		combinedRound(4)
    647 		increment(3)
    648 		combinedRound(5)
    649 		increment(4)
    650 		combinedRound(6)
    651 		increment(5)
    652 		combinedRound(7)
    653 		increment(6)
    654 
    655 		aesRound(8)
    656 		increment(7)
    657 
    658 		PXOR ACC0, ACCM
    659 		PXOR ACC1, ACCM
    660 		MOVOU ACCM, T0
    661 		PSRLDQ $8, ACCM
    662 		PSLLDQ $8, T0
    663 		PXOR ACCM, ACC1
    664 		PXOR T0, ACC0
    665 
    666 		reduceRound(ACC0)
    667 		aesRound(9)
    668 
    669 		reduceRound(ACC0)
    670 		PXOR ACC1, ACC0
    671 
    672 		MOVOU (16*10)(ks), T0
    673 		CMPQ NR, $12
    674 		JB encLast2
    675 		aesRnd(T0)
    676 		aesRound(11)
    677 		MOVOU (16*12)(ks), T0
    678 		JE encLast2
    679 		aesRnd(T0)
    680 		aesRound(13)
    681 		MOVOU (16*14)(ks), T0
    682 encLast2:
    683 		aesRndLast(T0)
    684 
    685 		MOVOU (16*0)(ptx), T0
    686 		PXOR T0, B0
    687 		MOVOU (16*1)(ptx), T0
    688 		PXOR T0, B1
    689 		MOVOU (16*2)(ptx), T0
    690 		PXOR T0, B2
    691 		MOVOU (16*3)(ptx), T0
    692 		PXOR T0, B3
    693 		MOVOU (16*4)(ptx), T0
    694 		PXOR T0, B4
    695 		MOVOU (16*5)(ptx), T0
    696 		PXOR T0, B5
    697 		MOVOU (16*6)(ptx), T0
    698 		PXOR T0, B6
    699 		MOVOU (16*7)(ptx), T0
    700 		PXOR T0, B7
    701 
    702 		MOVOU B0, (16*0)(ctx)
    703 		PSHUFB BSWAP, B0
    704 		PXOR ACC0, B0
    705 		MOVOU B1, (16*1)(ctx)
    706 		PSHUFB BSWAP, B1
    707 		MOVOU B2, (16*2)(ctx)
    708 		PSHUFB BSWAP, B2
    709 		MOVOU B3, (16*3)(ctx)
    710 		PSHUFB BSWAP, B3
    711 		MOVOU B4, (16*4)(ctx)
    712 		PSHUFB BSWAP, B4
    713 		MOVOU B5, (16*5)(ctx)
    714 		PSHUFB BSWAP, B5
    715 		MOVOU B6, (16*6)(ctx)
    716 		PSHUFB BSWAP, B6
    717 		MOVOU B7, (16*7)(ctx)
    718 		PSHUFB BSWAP, B7
    719 
    720 		MOVOU B0, (16*0)(SP)
    721 		MOVOU B1, (16*1)(SP)
    722 		MOVOU B2, (16*2)(SP)
    723 		MOVOU B3, (16*3)(SP)
    724 		MOVOU B4, (16*4)(SP)
    725 		MOVOU B5, (16*5)(SP)
    726 		MOVOU B6, (16*6)(SP)
    727 		MOVOU B7, (16*7)(SP)
    728 
    729 		LEAQ 128(ptx), ptx
    730 		LEAQ 128(ctx), ctx
    731 
    732 		JMP gcmAesEncOctetsLoop
    733 
    734 gcmAesEncOctetsEnd:
    735 
    736 	MOVOU (16*0)(SP), T0
    737 	MOVOU (16*0)(pTbl), ACC0
    738 	MOVOU (16*1)(pTbl), ACCM
    739 	MOVOU ACC0, ACC1
    740 	PSHUFD $78, T0, T1
    741 	PXOR T0, T1
    742 	PCLMULQDQ $0x00, T0, ACC0
    743 	PCLMULQDQ $0x11, T0, ACC1
    744 	PCLMULQDQ $0x00, T1, ACCM
    745 
    746 	mulRound(1)
    747 	mulRound(2)
    748 	mulRound(3)
    749 	mulRound(4)
    750 	mulRound(5)
    751 	mulRound(6)
    752 	mulRound(7)
    753 
    754 	PXOR ACC0, ACCM
    755 	PXOR ACC1, ACCM
    756 	MOVOU ACCM, T0
    757 	PSRLDQ $8, ACCM
    758 	PSLLDQ $8, T0
    759 	PXOR ACCM, ACC1
    760 	PXOR T0, ACC0
    761 
    762 	reduceRound(ACC0)
    763 	reduceRound(ACC0)
    764 	PXOR ACC1, ACC0
    765 
    766 	TESTQ ptxLen, ptxLen
    767 	JE gcmAesEncDone
    768 
    769 	SUBQ $7, aluCTR
    770 
    771 gcmAesEncSingles:
    772 
    773 	MOVOU (16*1)(ks), B1
    774 	MOVOU (16*2)(ks), B2
    775 	MOVOU (16*3)(ks), B3
    776 	MOVOU (16*4)(ks), B4
    777 	MOVOU (16*5)(ks), B5
    778 	MOVOU (16*6)(ks), B6
    779 	MOVOU (16*7)(ks), B7
    780 
    781 	MOVOU (16*14)(pTbl), T2
    782 
    783 gcmAesEncSinglesLoop:
    784 
    785 		CMPQ ptxLen, $16
    786 		JB gcmAesEncTail
    787 		SUBQ $16, ptxLen
    788 
    789 		MOVOU (8*16 + 0*16)(SP), B0
    790 		increment(0)
    791 
    792 		AESENC B1, B0
    793 		AESENC B2, B0
    794 		AESENC B3, B0
    795 		AESENC B4, B0
    796 		AESENC B5, B0
    797 		AESENC B6, B0
    798 		AESENC B7, B0
    799 		MOVOU (16*8)(ks), T0
    800 		AESENC T0, B0
    801 		MOVOU (16*9)(ks), T0
    802 		AESENC T0, B0
    803 		MOVOU (16*10)(ks), T0
    804 		CMPQ NR, $12
    805 		JB encLast3
    806 		AESENC T0, B0
    807 		MOVOU (16*11)(ks), T0
    808 		AESENC T0, B0
    809 		MOVOU (16*12)(ks), T0
    810 		JE encLast3
    811 		AESENC T0, B0
    812 		MOVOU (16*13)(ks), T0
    813 		AESENC T0, B0
    814 		MOVOU (16*14)(ks), T0
    815 encLast3:
    816 		AESENCLAST T0, B0
    817 
    818 		MOVOU (ptx), T0
    819 		PXOR T0, B0
    820 		MOVOU B0, (ctx)
    821 
    822 		PSHUFB BSWAP, B0
    823 		PXOR ACC0, B0
    824 
    825 		MOVOU T2, ACC0
    826 		MOVOU T2, ACC1
    827 		MOVOU (16*15)(pTbl), ACCM
    828 
    829 		PSHUFD $78, B0, T0
    830 		PXOR B0, T0
    831 		PCLMULQDQ $0x00, B0, ACC0
    832 		PCLMULQDQ $0x11, B0, ACC1
    833 		PCLMULQDQ $0x00, T0, ACCM
    834 
    835 		PXOR ACC0, ACCM
    836 		PXOR ACC1, ACCM
    837 		MOVOU ACCM, T0
    838 		PSRLDQ $8, ACCM
    839 		PSLLDQ $8, T0
    840 		PXOR ACCM, ACC1
    841 		PXOR T0, ACC0
    842 
    843 		reduceRound(ACC0)
    844 		reduceRound(ACC0)
    845 		PXOR ACC1, ACC0
    846 
    847 		LEAQ (16*1)(ptx), ptx
    848 		LEAQ (16*1)(ctx), ctx
    849 
    850 	JMP gcmAesEncSinglesLoop
    851 
    852 gcmAesEncTail:
    853 	TESTQ ptxLen, ptxLen
    854 	JE gcmAesEncDone
    855 
    856 	MOVOU (8*16 + 0*16)(SP), B0
    857 	AESENC B1, B0
    858 	AESENC B2, B0
    859 	AESENC B3, B0
    860 	AESENC B4, B0
    861 	AESENC B5, B0
    862 	AESENC B6, B0
    863 	AESENC B7, B0
    864 	MOVOU (16*8)(ks), T0
    865 	AESENC T0, B0
    866 	MOVOU (16*9)(ks), T0
    867 	AESENC T0, B0
    868 	MOVOU (16*10)(ks), T0
    869 	CMPQ NR, $12
    870 	JB encLast4
    871 	AESENC T0, B0
    872 	MOVOU (16*11)(ks), T0
    873 	AESENC T0, B0
    874 	MOVOU (16*12)(ks), T0
    875 	JE encLast4
    876 	AESENC T0, B0
    877 	MOVOU (16*13)(ks), T0
    878 	AESENC T0, B0
    879 	MOVOU (16*14)(ks), T0
    880 encLast4:
    881 	AESENCLAST T0, B0
    882 	MOVOU B0, T0
    883 
    884 	LEAQ -1(ptx)(ptxLen*1), ptx
    885 
    886 	MOVQ ptxLen, aluTMP
    887 	SHLQ $4, aluTMP
    888 
    889 	LEAQ andMask<>(SB), aluCTR
    890 	MOVOU -16(aluCTR)(aluTMP*1), T1
    891 
    892 	PXOR B0, B0
    893 ptxLoadLoop:
    894 		PSLLDQ $1, B0
    895 		PINSRB $0, (ptx), B0
    896 		LEAQ -1(ptx), ptx
    897 		DECQ ptxLen
    898 	JNE ptxLoadLoop
    899 
    900 	PXOR T0, B0
    901 	PAND T1, B0
    902 	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
    903 
    904 	PSHUFB BSWAP, B0
    905 	PXOR ACC0, B0
    906 
    907 	MOVOU T2, ACC0
    908 	MOVOU T2, ACC1
    909 	MOVOU (16*15)(pTbl), ACCM
    910 
    911 	PSHUFD $78, B0, T0
    912 	PXOR B0, T0
    913 	PCLMULQDQ $0x00, B0, ACC0
    914 	PCLMULQDQ $0x11, B0, ACC1
    915 	PCLMULQDQ $0x00, T0, ACCM
    916 
    917 	PXOR ACC0, ACCM
    918 	PXOR ACC1, ACCM
    919 	MOVOU ACCM, T0
    920 	PSRLDQ $8, ACCM
    921 	PSLLDQ $8, T0
    922 	PXOR ACCM, ACC1
    923 	PXOR T0, ACC0
    924 
    925 	reduceRound(ACC0)
    926 	reduceRound(ACC0)
    927 	PXOR ACC1, ACC0
    928 
    929 gcmAesEncDone:
    930 	MOVOU ACC0, (tPtr)
    931 	RET
    932 #undef increment
    933 
    934 // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
    935 TEXT gcmAesDec(SB),0,$128-96
    936 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
    937 #define combinedDecRound(i) \
    938 	MOVOU (16*i)(ks), T0;\
    939 	AESENC T0, B0;\
    940 	AESENC T0, B1;\
    941 	AESENC T0, B2;\
    942 	AESENC T0, B3;\
    943 	MOVOU (16*(i*2))(pTbl), T1;\
    944 	MOVOU T1, T2;\
    945 	AESENC T0, B4;\
    946 	AESENC T0, B5;\
    947 	AESENC T0, B6;\
    948 	AESENC T0, B7;\
    949 	MOVOU (16*i)(ctx), T0;\
    950 	PSHUFB BSWAP, T0;\
    951 	PCLMULQDQ $0x00, T0, T1;\
    952 	PXOR T1, ACC0;\
    953 	PSHUFD $78, T0, T1;\
    954 	PCLMULQDQ $0x11, T0, T2;\
    955 	PXOR T1, T0;\
    956 	PXOR T2, ACC1;\
    957 	MOVOU (16*(i*2+1))(pTbl), T2;\
    958 	PCLMULQDQ $0x00, T2, T0;\
    959 	PXOR T0, ACCM
    960 
    961 	MOVQ productTable+0(FP), pTbl
    962 	MOVQ dst+8(FP), ptx
    963 	MOVQ src_base+32(FP), ctx
    964 	MOVQ src_len+40(FP), ptxLen
    965 	MOVQ ctr+56(FP), ctrPtr
    966 	MOVQ T+64(FP), tPtr
    967 	MOVQ ks_base+72(FP), ks
    968 	MOVQ ks_len+80(FP), NR
    969 
    970 	SHRQ $2, NR
    971 	DECQ NR
    972 
    973 	MOVOU bswapMask<>(SB), BSWAP
    974 	MOVOU gcmPoly<>(SB), POLY
    975 
    976 	MOVOU (tPtr), ACC0
    977 	PXOR ACC1, ACC1
    978 	PXOR ACCM, ACCM
    979 	MOVOU (ctrPtr), B0
    980 	MOVL (3*4)(ctrPtr), aluCTR
    981 	MOVOU (ks), T0
    982 	MOVL (3*4)(ks), aluK
    983 	BSWAPL aluCTR
    984 	BSWAPL aluK
    985 
    986 	PXOR B0, T0
    987 	MOVOU T0, (0*16)(SP)
    988 	increment(0)
    989 
    990 	CMPQ ptxLen, $128
    991 	JB gcmAesDecSingles
    992 
    993 	MOVOU T0, (1*16)(SP)
    994 	increment(1)
    995 	MOVOU T0, (2*16)(SP)
    996 	increment(2)
    997 	MOVOU T0, (3*16)(SP)
    998 	increment(3)
    999 	MOVOU T0, (4*16)(SP)
   1000 	increment(4)
   1001 	MOVOU T0, (5*16)(SP)
   1002 	increment(5)
   1003 	MOVOU T0, (6*16)(SP)
   1004 	increment(6)
   1005 	MOVOU T0, (7*16)(SP)
   1006 	increment(7)
   1007 
   1008 gcmAesDecOctetsLoop:
   1009 
   1010 		CMPQ ptxLen, $128
   1011 		JB gcmAesDecEndOctets
   1012 		SUBQ $128, ptxLen
   1013 
   1014 		MOVOU (0*16)(SP), B0
   1015 		MOVOU (1*16)(SP), B1
   1016 		MOVOU (2*16)(SP), B2
   1017 		MOVOU (3*16)(SP), B3
   1018 		MOVOU (4*16)(SP), B4
   1019 		MOVOU (5*16)(SP), B5
   1020 		MOVOU (6*16)(SP), B6
   1021 		MOVOU (7*16)(SP), B7
   1022 
   1023 		MOVOU (16*0)(ctx), T0
   1024 		PSHUFB BSWAP, T0
   1025 		PXOR ACC0, T0
   1026 		PSHUFD $78, T0, T1
   1027 		PXOR T0, T1
   1028 
   1029 		MOVOU (16*0)(pTbl), ACC0
   1030 		MOVOU (16*1)(pTbl), ACCM
   1031 		MOVOU ACC0, ACC1
   1032 
   1033 		PCLMULQDQ $0x00, T1, ACCM
   1034 		PCLMULQDQ $0x00, T0, ACC0
   1035 		PCLMULQDQ $0x11, T0, ACC1
   1036 
   1037 		combinedDecRound(1)
   1038 		increment(0)
   1039 		combinedDecRound(2)
   1040 		increment(1)
   1041 		combinedDecRound(3)
   1042 		increment(2)
   1043 		combinedDecRound(4)
   1044 		increment(3)
   1045 		combinedDecRound(5)
   1046 		increment(4)
   1047 		combinedDecRound(6)
   1048 		increment(5)
   1049 		combinedDecRound(7)
   1050 		increment(6)
   1051 
   1052 		aesRound(8)
   1053 		increment(7)
   1054 
   1055 		PXOR ACC0, ACCM
   1056 		PXOR ACC1, ACCM
   1057 		MOVOU ACCM, T0
   1058 		PSRLDQ $8, ACCM
   1059 		PSLLDQ $8, T0
   1060 		PXOR ACCM, ACC1
   1061 		PXOR T0, ACC0
   1062 
   1063 		reduceRound(ACC0)
   1064 		aesRound(9)
   1065 
   1066 		reduceRound(ACC0)
   1067 		PXOR ACC1, ACC0
   1068 
   1069 		MOVOU (16*10)(ks), T0
   1070 		CMPQ NR, $12
   1071 		JB decLast1
   1072 		aesRnd(T0)
   1073 		aesRound(11)
   1074 		MOVOU (16*12)(ks), T0
   1075 		JE decLast1
   1076 		aesRnd(T0)
   1077 		aesRound(13)
   1078 		MOVOU (16*14)(ks), T0
   1079 decLast1:
   1080 		aesRndLast(T0)
   1081 
   1082 		MOVOU (16*0)(ctx), T0
   1083 		PXOR T0, B0
   1084 		MOVOU (16*1)(ctx), T0
   1085 		PXOR T0, B1
   1086 		MOVOU (16*2)(ctx), T0
   1087 		PXOR T0, B2
   1088 		MOVOU (16*3)(ctx), T0
   1089 		PXOR T0, B3
   1090 		MOVOU (16*4)(ctx), T0
   1091 		PXOR T0, B4
   1092 		MOVOU (16*5)(ctx), T0
   1093 		PXOR T0, B5
   1094 		MOVOU (16*6)(ctx), T0
   1095 		PXOR T0, B6
   1096 		MOVOU (16*7)(ctx), T0
   1097 		PXOR T0, B7
   1098 
   1099 		MOVOU B0, (16*0)(ptx)
   1100 		MOVOU B1, (16*1)(ptx)
   1101 		MOVOU B2, (16*2)(ptx)
   1102 		MOVOU B3, (16*3)(ptx)
   1103 		MOVOU B4, (16*4)(ptx)
   1104 		MOVOU B5, (16*5)(ptx)
   1105 		MOVOU B6, (16*6)(ptx)
   1106 		MOVOU B7, (16*7)(ptx)
   1107 
   1108 		LEAQ 128(ptx), ptx
   1109 		LEAQ 128(ctx), ctx
   1110 
   1111 		JMP gcmAesDecOctetsLoop
   1112 
   1113 gcmAesDecEndOctets:
   1114 
   1115 	SUBQ $7, aluCTR
   1116 
   1117 gcmAesDecSingles:
   1118 
   1119 	MOVOU (16*1)(ks), B1
   1120 	MOVOU (16*2)(ks), B2
   1121 	MOVOU (16*3)(ks), B3
   1122 	MOVOU (16*4)(ks), B4
   1123 	MOVOU (16*5)(ks), B5
   1124 	MOVOU (16*6)(ks), B6
   1125 	MOVOU (16*7)(ks), B7
   1126 
   1127 	MOVOU (16*14)(pTbl), T2
   1128 
   1129 gcmAesDecSinglesLoop:
   1130 
   1131 		CMPQ ptxLen, $16
   1132 		JB gcmAesDecTail
   1133 		SUBQ $16, ptxLen
   1134 
   1135 		MOVOU (ctx), B0
   1136 		MOVOU B0, T1
   1137 		PSHUFB BSWAP, B0
   1138 		PXOR ACC0, B0
   1139 
   1140 		MOVOU T2, ACC0
   1141 		MOVOU T2, ACC1
   1142 		MOVOU (16*15)(pTbl), ACCM
   1143 
   1144 		PCLMULQDQ $0x00, B0, ACC0
   1145 		PCLMULQDQ $0x11, B0, ACC1
   1146 		PSHUFD $78, B0, T0
   1147 		PXOR B0, T0
   1148 		PCLMULQDQ $0x00, T0, ACCM
   1149 
   1150 		PXOR ACC0, ACCM
   1151 		PXOR ACC1, ACCM
   1152 		MOVOU ACCM, T0
   1153 		PSRLDQ $8, ACCM
   1154 		PSLLDQ $8, T0
   1155 		PXOR ACCM, ACC1
   1156 		PXOR T0, ACC0
   1157 
   1158 		reduceRound(ACC0)
   1159 		reduceRound(ACC0)
   1160 		PXOR ACC1, ACC0
   1161 
   1162 		MOVOU (0*16)(SP), B0
   1163 		increment(0)
   1164 		AESENC B1, B0
   1165 		AESENC B2, B0
   1166 		AESENC B3, B0
   1167 		AESENC B4, B0
   1168 		AESENC B5, B0
   1169 		AESENC B6, B0
   1170 		AESENC B7, B0
   1171 		MOVOU (16*8)(ks), T0
   1172 		AESENC T0, B0
   1173 		MOVOU (16*9)(ks), T0
   1174 		AESENC T0, B0
   1175 		MOVOU (16*10)(ks), T0
   1176 		CMPQ NR, $12
   1177 		JB decLast2
   1178 		AESENC T0, B0
   1179 		MOVOU (16*11)(ks), T0
   1180 		AESENC T0, B0
   1181 		MOVOU (16*12)(ks), T0
   1182 		JE decLast2
   1183 		AESENC T0, B0
   1184 		MOVOU (16*13)(ks), T0
   1185 		AESENC T0, B0
   1186 		MOVOU (16*14)(ks), T0
   1187 decLast2:
   1188 		AESENCLAST T0, B0
   1189 
   1190 		PXOR T1, B0
   1191 		MOVOU B0, (ptx)
   1192 
   1193 		LEAQ (16*1)(ptx), ptx
   1194 		LEAQ (16*1)(ctx), ctx
   1195 
   1196 	JMP gcmAesDecSinglesLoop
   1197 
   1198 gcmAesDecTail:
   1199 
   1200 	TESTQ ptxLen, ptxLen
   1201 	JE gcmAesDecDone
   1202 
   1203 	MOVQ ptxLen, aluTMP
   1204 	SHLQ $4, aluTMP
   1205 	LEAQ andMask<>(SB), aluCTR
   1206 	MOVOU -16(aluCTR)(aluTMP*1), T1
   1207 
   1208 	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
   1209 	PAND T1, B0
   1210 
   1211 	MOVOU B0, T1
   1212 	PSHUFB BSWAP, B0
   1213 	PXOR ACC0, B0
   1214 
   1215 	MOVOU (16*14)(pTbl), ACC0
   1216 	MOVOU (16*15)(pTbl), ACCM
   1217 	MOVOU ACC0, ACC1
   1218 
   1219 	PCLMULQDQ $0x00, B0, ACC0
   1220 	PCLMULQDQ $0x11, B0, ACC1
   1221 	PSHUFD $78, B0, T0
   1222 	PXOR B0, T0
   1223 	PCLMULQDQ $0x00, T0, ACCM
   1224 
   1225 	PXOR ACC0, ACCM
   1226 	PXOR ACC1, ACCM
   1227 	MOVOU ACCM, T0
   1228 	PSRLDQ $8, ACCM
   1229 	PSLLDQ $8, T0
   1230 	PXOR ACCM, ACC1
   1231 	PXOR T0, ACC0
   1232 
   1233 	reduceRound(ACC0)
   1234 	reduceRound(ACC0)
   1235 	PXOR ACC1, ACC0
   1236 
   1237 	MOVOU (0*16)(SP), B0
   1238 	increment(0)
   1239 	AESENC B1, B0
   1240 	AESENC B2, B0
   1241 	AESENC B3, B0
   1242 	AESENC B4, B0
   1243 	AESENC B5, B0
   1244 	AESENC B6, B0
   1245 	AESENC B7, B0
   1246 	MOVOU (16*8)(ks), T0
   1247 	AESENC T0, B0
   1248 	MOVOU (16*9)(ks), T0
   1249 	AESENC T0, B0
   1250 	MOVOU (16*10)(ks), T0
   1251 	CMPQ NR, $12
   1252 	JB decLast3
   1253 	AESENC T0, B0
   1254 	MOVOU (16*11)(ks), T0
   1255 	AESENC T0, B0
   1256 	MOVOU (16*12)(ks), T0
   1257 	JE decLast3
   1258 	AESENC T0, B0
   1259 	MOVOU (16*13)(ks), T0
   1260 	AESENC T0, B0
   1261 	MOVOU (16*14)(ks), T0
   1262 decLast3:
   1263 	AESENCLAST T0, B0
   1264 	PXOR T1, B0
   1265 
   1266 ptxStoreLoop:
   1267 		PEXTRB $0, B0, (ptx)
   1268 		PSRLDQ $1, B0
   1269 		LEAQ 1(ptx), ptx
   1270 		DECQ ptxLen
   1271 
   1272 	JNE ptxStoreLoop
   1273 
   1274 gcmAesDecDone:
   1275 
   1276 	MOVOU ACC0, (tPtr)
   1277 	RET
   1278