Home | History | Annotate | Download | only in modes
      1 /* ====================================================================
      2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  *
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in
     13  *    the documentation and/or other materials provided with the
     14  *    distribution.
     15  *
     16  * 3. All advertising materials mentioning features or use of this
     17  *    software must display the following acknowledgment:
     18  *    "This product includes software developed by the OpenSSL Project
     19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
     20  *
     21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
     22  *    endorse or promote products derived from this software without
     23  *    prior written permission. For written permission, please contact
     24  *    openssl-core (at) openssl.org.
     25  *
     26  * 5. Products derived from this software may not be called "OpenSSL"
     27  *    nor may "OpenSSL" appear in their names without prior written
     28  *    permission of the OpenSSL Project.
     29  *
     30  * 6. Redistributions of any form whatsoever must retain the following
     31  *    acknowledgment:
     32  *    "This product includes software developed by the OpenSSL Project
     33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
     34  *
     35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
     36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
     39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
     44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
     46  * OF THE POSSIBILITY OF SUCH DAMAGE.
     47  * ====================================================================
     48  */
     49 
     50 #define OPENSSL_FIPSAPI
     51 
     52 #include <openssl/crypto.h>
     53 #include "modes_lcl.h"
     54 #include <string.h>
     55 
     56 #ifndef MODES_DEBUG
     57 # ifndef NDEBUG
     58 #  define NDEBUG
     59 # endif
     60 #endif
     61 #include <assert.h>
     62 
     63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
     64 /* redefine, because alignment is ensured */
     65 #undef	GETU32
     66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
     67 #undef	PUTU32
     68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
     69 #endif
     70 
/*
 * PACK() positions a 16-bit reduction constant in the most significant
 * 16 bits of a size_t, so the same rem_4bit/rem_8bit tables work for
 * both 32- and 64-bit size_t without run-time shifting.
 */
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT() shifts the 128-bit value V right by one bit and, when a
 * set bit is shifted out, folds in the GHASH reduction constant (the
 * reflected polynomial, 0xE1 in the top byte).  The sizeof(size_t)
 * test is a compile-time constant, so the compiler keeps only the
 * branch matching the target word size.
 */
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)
     84 
     85 /*
     86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
     87  * never be set to 8. 8 is effectively reserved for testing purposes.
     88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
     89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
     90  * whole spectrum of possible table driven implementations. Why? In
     91  * non-"Shoup's" case memory access pattern is segmented in such manner,
     92  * that it's trivial to see that cache timing information can reveal
     93  * fair portion of intermediate hash value. Given that ciphertext is
     94  * always available to attacker, it's possible for him to attempt to
     95  * deduce secret parameter H and if successful, tamper with messages
     96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
     97  * not as trivial, but there is no reason to believe that it's resistant
     98  * to cache-timing attack. And the thing about "8-bit" implementation is
     99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
    100  * key + 1KB shared. Well, on pros side it should be twice as fast as
    101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
    102  * was observed to run ~75% faster, closer to 100% for commercial
    103  * compilers... Yet "4-bit" procedure is preferred, because it's
    104  * believed to provide better security-performance balance and adequate
    105  * all-round performance. "All-round" refers to things like:
    106  *
    107  * - shorter setup time effectively improves overall timing for
    108  *   handling short messages;
    109  * - larger table allocation can become unbearable because of VM
    110  *   subsystem penalties (for example on Windows large enough free
    111  *   results in VM working set trimming, meaning that consequent
    112  *   malloc would immediately incur working set expansion);
    113  * - larger table has larger cache footprint, which can affect
    114  *   performance of other code paths (not necessarily even from same
    115  *   thread in Hyper-Threading world);
    116  *
    117  * Value of 1 is not appropriate for performance reasons.
    118  */
    119 #if	TABLE_BITS==8
    120 
    121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
    122 {
    123 	int  i, j;
    124 	u128 V;
    125 
    126 	Htable[0].hi = 0;
    127 	Htable[0].lo = 0;
    128 	V.hi = H[0];
    129 	V.lo = H[1];
    130 
    131 	for (Htable[128]=V, i=64; i>0; i>>=1) {
    132 		REDUCE1BIT(V);
    133 		Htable[i] = V;
    134 	}
    135 
    136 	for (i=2; i<256; i<<=1) {
    137 		u128 *Hi = Htable+i, H0 = *Hi;
    138 		for (j=1; j<i; ++j) {
    139 			Hi[j].hi = H0.hi^Htable[j].hi;
    140 			Hi[j].lo = H0.lo^Htable[j].lo;
    141 		}
    142 	}
    143 }
    144 
/*
 * Multiply Xi by H in GF(2^128) using the 256-entry table built by
 * gcm_init_8bit().  Xi is read and rewritten in wire (big-endian) byte
 * order; the accumulator Z is kept in host order and converted back on
 * exit.  Bytes of Xi are consumed from last (index 15) to first.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;	/* walk Xi from byte 15 down to 0 */
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	/* per-byte reduction constants, pre-positioned by PACK() for
	 * either size_t width */
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		/* accumulate the table entry for the current byte */
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;	/* all 16 bytes consumed */

		n = *(--xi);

		/* shift Z right by 8 bits; fold the shifted-out byte back
		 * in via rem_8bit (reduction modulo the GHASH polynomial) */
		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	/* store Z back into Xi in wire (big-endian) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
    252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
    253 
    254 #elif	TABLE_BITS==4
    255 
/*
 * Build the 16-entry (4-bit "Shoup") multiplication table for the hash
 * subkey H.  On exit Htable[i] holds the GF(2^128) product of H with
 * the 4-bit value i: entries 8,4,2,1 come from successive REDUCE1BIT()
 * steps applied to H, the rest are XOR combinations of those.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	/* loop-based construction: smaller code, identical table contents */
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	/* fully unrolled construction */
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable:
	 * swap (or rotate, on big-endian hosts) the halves of each entry.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
    327 
    328 #ifndef GHASH_ASM
/* Per-nibble reduction constants: rem_4bit[r] is the correction to XOR
 * into the top of Z after shifting out the 4-bit value r.  Values are
 * pre-positioned in the top 16 bits of a size_t by PACK(). */
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
    334 
/*
 * Multiply Xi by H in GF(2^128) using the 16-entry table from
 * gcm_init_4bit().  Xi is processed one nibble at a time, low nibble
 * then high nibble of each byte, from byte 15 down to byte 0; each
 * 4-bit shift of Z is reduced via rem_4bit.  Xi is read and rewritten
 * in wire (big-endian) byte order.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	/* start with the low nibble of the last byte */
	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits, reduce, fold in high-nibble entry */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* both nibbles of byte 0 done */

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		/* same shift/reduce step, then fold in low-nibble entry */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	/* write Z back into Xi in wire (big-endian) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
    397 
    398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
    399 /*
    400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
    401  * details... Compiler-generated code doesn't seem to give any
    402  * performance improvement, at least not on x86[_64]. It's here
    403  * mostly as reference and a placeholder for possible future
    404  * non-trivial optimization[s]...
    405  */
/*
 * Streamed GHASH over `len` bytes (a multiple of 16): for each 16-byte
 * block, Xi ^= block, then Xi *= H using the 4-bit table.  Xi stays in
 * wire (big-endian) byte order throughout.
 *
 * NOTE(review): the two preprocessor branches below each open their
 * own `do {`, and BOTH are closed by the single
 * `} while (inp+=16, len-=16);` after the #endif -- the braces are
 * deliberately balanced only across the preprocessor, not within each
 * branch.  The #else branch (disabled by `#if 1`) is an alternative
 * that trades 256+16 bytes of per-key tables plus a shared 512-byte
 * table for speed.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
	/* Xi ^= inp is folded into the nibble extraction below */
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits, reduce, fold in high-nibble entry */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* all 16 bytes of the block done */

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
	/* bytes 15..1: process both nibbles with an 8-bit shift step */
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	/* byte 0: final 4-bit step */
	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	/* write the updated Xi back in wire (big-endian) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);	/* caller guarantees len is a nonzero multiple of 16 */
}
    568 #endif
    569 #else
    570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
    571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    572 #endif
    573 
    574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
    575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
    576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
    577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
    578  * trashing effect. In other words idea is to hash data while it's
    579  * still in L1 cache after encryption pass... */
    580 #define GHASH_CHUNK       (3*1024)
    581 #endif
    582 
    583 #else	/* TABLE_BITS */
    584 
/*
 * Bit-serial GF(2^128) multiplication: Xi *= H, one bit at a time.
 * Used only when TABLE_BITS==1 (no lookup tables, minimal footprint,
 * slowest).  Xi is in wire byte order; H is already in host byte order.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	/* consume Xi one machine word at a time, most significant bit first */
	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		/* NOTE(review): X is a signed long; `X<<=1` on a negative value
		 * and `X>>(width-1)` below rely on arithmetic-shift,
		 * two's-complement behavior that the C standard does not
		 * guarantee -- confirm compiler assumptions before reuse. */
		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			/* M is all-ones if the current top bit of X is set, else 0 */
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			/* shift V right one bit with polynomial reduction */
			REDUCE1BIT(V);
		}
	}

	/* write Z back into Xi in wire (big-endian) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
    641 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
    642 
    643 #endif
    644 
    645 #if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
    646 # if	!defined(I386_ONLY) && \
    647 	(defined(__i386)	|| defined(__i386__)	|| \
    648 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
    649 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
    650 #  define GHASH_ASM_X86_OR_64
    651 #  define GCM_FUNCREF_4BIT
    652 extern unsigned int OPENSSL_ia32cap_P[2];
    653 
    654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
    655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
    656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    657 
    658 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
    659 #   define GHASH_ASM_X86
    660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
    661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    662 
    663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
    664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    665 #  endif
    666 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
    667 #  include "arm_arch.h"
    668 #  if __ARM_ARCH__>=7
    669 #   define GHASH_ASM_ARM
    670 #   define GCM_FUNCREF_4BIT
    671 #   define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL)
    672 #   if defined(__arm__) || defined(__arm)
    673 #    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
    674 #   endif
    675 void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
    676 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
    677 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    678 void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
    679 void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
    680 void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    681 #  endif
    682 # endif
    683 #endif
    684 
    685 #ifdef GCM_FUNCREF_4BIT
    686 # undef  GCM_MUL
    687 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
    688 # ifdef GHASH
    689 #  undef  GHASH
    690 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
    691 # endif
    692 #endif
    693 
/*
 * Initialize a GCM128 context for the given block cipher and key:
 * zero the context, compute the hash subkey H = E_K(0^128), store it
 * in host byte order, then build the GHASH table and (where the build
 * provides them) select CPU-specific gmult/ghash implementations based
 * on run-time capability bits.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	/* H = E_K(0^128); ctx->H.c is all-zero after the memset above */
	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* prefer carry-less-multiplication (PCLMULQDQ) code when available */
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	/* run-time dispatch: PMULL (ARMv8), then NEON, then generic 4-bit */
#  ifdef PMULL_CAPABLE
	if (PMULL_CAPABLE) {
		gcm_init_v8(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_v8;
		ctx->ghash = gcm_ghash_v8;
	} else
#  endif
#  ifdef NEON_CAPABLE
	if (NEON_CAPABLE) {
		gcm_init_neon(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else
#  endif
	{
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
    774 
/*
 * Set the IV and reset the context for a new GCM operation.
 * A 96-bit (12-byte) IV is used directly with the 32-bit counter set
 * to 1; any other length is GHASHed together with its 64-bit bit
 * length to derive the initial counter block Y0.  Finishes by
 * encrypting Y0 into EK0 (used for the final tag) and incrementing
 * the counter for the first data block.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* reset all running state from any previous operation */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* other lengths: Y0 = GHASH(zero-padded IV || 64-bit bit length) */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			/* partial final block: remaining bytes, implicitly
			 * zero-padded since Yi started out zero */
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* extract the 32-bit counter from the last 4 bytes of Y0 */
		if (is_endian.little)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E_K(Y0), kept for the final tag computation */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* store the incremented counter back into Yi (big-endian field) */
	if (is_endian.little)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}
    852 
/*
 * CRYPTO_gcm128_aad feeds additional authenticated data into the GHASH
 * state Xi.  May be called repeatedly, but only before any
 * encrypt/decrypt call for the current message.  Returns 0 on success,
 * -2 if message processing has already begun, -1 if the accumulated
 * AAD length exceeds 2^61 bytes (or the 64-bit total wrapped).
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* AAD must be complete before the first encrypt/decrypt call */
	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* first finish a partial block carried over from a previous call */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-hash all whole 16-byte blocks in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* stash the trailing partial block; it is multiplied in later */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
    909 
/*
 * CRYPTO_gcm128_encrypt CTR-encrypts |len| bytes from |in| into |out|
 * and folds the produced ciphertext into the GHASH state Xi.  It may
 * be called repeatedly to process a message incrementally; progress
 * within a partial trailing block is carried across calls in
 * ctx->mres.  Returns 0 on success or -1 if the total message length
 * would exceed the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	/* enforce the GCM plaintext limit and detect 64-bit wrap-around */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the 32-bit big-endian counter out of Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	/* n = bytes already consumed from the current key-stream block */
	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* finish the partial block left from a previous call */
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-wise path needs aligned buffers; else use byte loop */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* encrypt GHASH_CHUNK bytes, then hash the output in one go */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks: len rounded down to multiple of 16 */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no one-shot GHASH: hash each block as it is produced */
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block: remember progress in mres */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* byte-at-a-time fallback (small footprint or misaligned buffers) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
   1089 
   1090 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
   1091 		const unsigned char *in, unsigned char *out,
   1092 		size_t len)
   1093 {
   1094 	const union { long one; char little; } is_endian = {1};
   1095 	unsigned int n, ctr;
   1096 	size_t i;
   1097 	u64        mlen  = ctx->len.u[1];
   1098 	block128_f block = ctx->block;
   1099 	void      *key   = ctx->key;
   1100 #ifdef GCM_FUNCREF_4BIT
   1101 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
   1102 # ifdef GHASH
   1103 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
   1104 				const u8 *inp,size_t len)	= ctx->ghash;
   1105 # endif
   1106 #endif
   1107 
   1108 	mlen += len;
   1109 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
   1110 		return -1;
   1111 	ctx->len.u[1] = mlen;
   1112 
   1113 	if (ctx->ares) {
   1114 		/* First call to decrypt finalizes GHASH(AAD) */
   1115 		GCM_MUL(ctx,Xi);
   1116 		ctx->ares = 0;
   1117 	}
   1118 
   1119 	if (is_endian.little)
   1120 #ifdef BSWAP4
   1121 		ctr = BSWAP4(ctx->Yi.d[3]);
   1122 #else
   1123 		ctr = GETU32(ctx->Yi.c+12);
   1124 #endif
   1125 	else
   1126 		ctr = ctx->Yi.d[3];
   1127 
   1128 	n = ctx->mres;
   1129 #if !defined(OPENSSL_SMALL_FOOTPRINT)
   1130 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
   1131 		if (n) {
   1132 			while (n && len) {
   1133 				u8 c = *(in++);
   1134 				*(out++) = c^ctx->EKi.c[n];
   1135 				ctx->Xi.c[n] ^= c;
   1136 				--len;
   1137 				n = (n+1)%16;
   1138 			}
   1139 			if (n==0) GCM_MUL (ctx,Xi);
   1140 			else {
   1141 				ctx->mres = n;
   1142 				return 0;
   1143 			}
   1144 		}
   1145 #if defined(STRICT_ALIGNMENT)
   1146 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
   1147 			break;
   1148 #endif
   1149 #if defined(GHASH) && defined(GHASH_CHUNK)
   1150 		while (len>=GHASH_CHUNK) {
   1151 		    size_t j=GHASH_CHUNK;
   1152 
   1153 		    GHASH(ctx,in,GHASH_CHUNK);
   1154 		    while (j) {
   1155 		    	size_t *out_t=(size_t *)out;
   1156 		    	const size_t *in_t=(const size_t *)in;
   1157 
   1158 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
   1159 			++ctr;
   1160 			if (is_endian.little)
   1161 #ifdef BSWAP4
   1162 				ctx->Yi.d[3] = BSWAP4(ctr);
   1163 #else
   1164 				PUTU32(ctx->Yi.c+12,ctr);
   1165 #endif
   1166 			else
   1167 				ctx->Yi.d[3] = ctr;
   1168 			for (i=0; i<16/sizeof(size_t); ++i)
   1169 				out_t[i] = in_t[i]^ctx->EKi.t[i];
   1170 			out += 16;
   1171 			in  += 16;
   1172 			j   -= 16;
   1173 		    }
   1174 		    len -= GHASH_CHUNK;
   1175 		}
   1176 		if ((i = (len&(size_t)-16))) {
   1177 		    GHASH(ctx,in,i);
   1178 		    while (len>=16) {
   1179 		    	size_t *out_t=(size_t *)out;
   1180 		    	const size_t *in_t=(const size_t *)in;
   1181 
   1182 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
   1183 			++ctr;
   1184 			if (is_endian.little)
   1185 #ifdef BSWAP4
   1186 				ctx->Yi.d[3] = BSWAP4(ctr);
   1187 #else
   1188 				PUTU32(ctx->Yi.c+12,ctr);
   1189 #endif
   1190 			else
   1191 				ctx->Yi.d[3] = ctr;
   1192 			for (i=0; i<16/sizeof(size_t); ++i)
   1193 				out_t[i] = in_t[i]^ctx->EKi.t[i];
   1194 			out += 16;
   1195 			in  += 16;
   1196 			len -= 16;
   1197 		    }
   1198 		}
   1199 #else
   1200 		while (len>=16) {
   1201 		    	size_t *out_t=(size_t *)out;
   1202 		    	const size_t *in_t=(const size_t *)in;
   1203 
   1204 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
   1205 			++ctr;
   1206 			if (is_endian.little)
   1207 #ifdef BSWAP4
   1208 				ctx->Yi.d[3] = BSWAP4(ctr);
   1209 #else
   1210 				PUTU32(ctx->Yi.c+12,ctr);
   1211 #endif
   1212 			else
   1213 				ctx->Yi.d[3] = ctr;
   1214 			for (i=0; i<16/sizeof(size_t); ++i) {
   1215 				size_t c = in[i];
   1216 				out[i] = c^ctx->EKi.t[i];
   1217 				ctx->Xi.t[i] ^= c;
   1218 			}
   1219 			GCM_MUL(ctx,Xi);
   1220 			out += 16;
   1221 			in  += 16;
   1222 			len -= 16;
   1223 		}
   1224 #endif
   1225 		if (len) {
   1226 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
   1227 			++ctr;
   1228 			if (is_endian.little)
   1229 #ifdef BSWAP4
   1230 				ctx->Yi.d[3] = BSWAP4(ctr);
   1231 #else
   1232 				PUTU32(ctx->Yi.c+12,ctr);
   1233 #endif
   1234 			else
   1235 				ctx->Yi.d[3] = ctr;
   1236 			while (len--) {
   1237 				u8 c = in[n];
   1238 				ctx->Xi.c[n] ^= c;
   1239 				out[n] = c^ctx->EKi.c[n];
   1240 				++n;
   1241 			}
   1242 		}
   1243 
   1244 		ctx->mres = n;
   1245 		return 0;
   1246 	} while(0);
   1247 #endif
   1248 	for (i=0;i<len;++i) {
   1249 		u8 c;
   1250 		if (n==0) {
   1251 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
   1252 			++ctr;
   1253 			if (is_endian.little)
   1254 #ifdef BSWAP4
   1255 				ctx->Yi.d[3] = BSWAP4(ctr);
   1256 #else
   1257 				PUTU32(ctx->Yi.c+12,ctr);
   1258 #endif
   1259 			else
   1260 				ctx->Yi.d[3] = ctr;
   1261 		}
   1262 		c = in[i];
   1263 		out[i] = c^ctx->EKi.c[n];
   1264 		ctx->Xi.c[n] ^= c;
   1265 		n = (n+1)%16;
   1266 		if (n==0)
   1267 			GCM_MUL(ctx,Xi);
   1268 	}
   1269 
   1270 	ctx->mres = n;
   1271 	return 0;
   1272 }
   1273 
/*
 * CRYPTO_gcm128_encrypt_ctr32 behaves like CRYPTO_gcm128_encrypt but
 * drives bulk work through the caller-supplied ctr128_f |stream|
 * routine (many counter blocks per call, e.g. an accelerated AES-CTR),
 * falling back to ctx->block only for a trailing partial block.
 * Returns 0 on success, -1 if the total message length would exceed
 * the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* enforce the GCM message limit and detect 64-bit wrap-around */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the 32-bit big-endian counter out of Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	/* finish the partial block left from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* encrypt GHASH_CHUNK bytes via |stream|, then hash the output */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks: len rounded down to multiple of 16 */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* trailing partial block: encrypt with ctx->block, track in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
   1388 
/*
 * CRYPTO_gcm128_decrypt_ctr32 is the stream counterpart of
 * CRYPTO_gcm128_decrypt: ciphertext is folded into the GHASH state
 * first, then decrypted in bulk through the caller-supplied ctr128_f
 * |stream| routine; ctx->block handles a trailing partial block.
 * Returns 0 on success, -1 if the total message length would exceed
 * the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* enforce the GCM message limit and detect 64-bit wrap-around */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the 32-bit big-endian counter out of Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	/* finish the partial block left from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* hash GHASH_CHUNK bytes of ciphertext, then decrypt via |stream| */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks: len rounded down to multiple of 16 */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* rewind: the blocks were only hashed, not yet decrypted */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* trailing partial block: decrypt with ctx->block, track in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
   1510 
   1511 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
   1512 			size_t len)
   1513 {
   1514 	const union { long one; char little; } is_endian = {1};
   1515 	u64 alen = ctx->len.u[0]<<3;
   1516 	u64 clen = ctx->len.u[1]<<3;
   1517 #ifdef GCM_FUNCREF_4BIT
   1518 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
   1519 #endif
   1520 
   1521 	if (ctx->mres || ctx->ares)
   1522 		GCM_MUL(ctx,Xi);
   1523 
   1524 	if (is_endian.little) {
   1525 #ifdef BSWAP8
   1526 		alen = BSWAP8(alen);
   1527 		clen = BSWAP8(clen);
   1528 #else
   1529 		u8 *p = ctx->len.c;
   1530 
   1531 		ctx->len.u[0] = alen;
   1532 		ctx->len.u[1] = clen;
   1533 
   1534 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
   1535 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
   1536 #endif
   1537 	}
   1538 
   1539 	ctx->Xi.u[0] ^= alen;
   1540 	ctx->Xi.u[1] ^= clen;
   1541 	GCM_MUL(ctx,Xi);
   1542 
   1543 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
   1544 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
   1545 
   1546 	if (tag && len<=sizeof(ctx->Xi))
   1547 		return memcmp(ctx->Xi.c,tag,len);
   1548 	else
   1549 		return -1;
   1550 }
   1551 
   1552 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
   1553 {
   1554 	CRYPTO_gcm128_finish(ctx, NULL, 0);
   1555 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
   1556 }
   1557 
   1558 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
   1559 {
   1560 	GCM128_CONTEXT *ret;
   1561 
   1562 	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
   1563 		CRYPTO_gcm128_init(ret,key,block);
   1564 
   1565 	return ret;
   1566 }
   1567 
   1568 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
   1569 {
   1570 	if (ctx) {
   1571 		OPENSSL_cleanse(ctx,sizeof(*ctx));
   1572 		OPENSSL_free(ctx);
   1573 	}
   1574 }
   1575 
   1576 #if defined(SELFTEST)
   1577 #include <stdio.h>
   1578 #include <openssl/aes.h>
   1579 
/*
 * Known-answer vectors for the SELFTEST driver, apparently the standard
 * test cases from the GCM specification (McGrew & Viega).  Naming:
 * K = key, P = plaintext, A = additional authenticated data, IV = nonce,
 * C = expected ciphertext, T = expected tag.  NULL pointers denote
 * zero-length inputs; uninitialized const arrays (e.g. K1[16]) are
 * implicitly all-zero.  Test Cases 1-6 use 128-bit keys.
 */
/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
   1649 
/* Test Cases 7-12: same pattern as 1-6, but with 192-bit (24-byte) keys. */
/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
   1720 
   1721 /* Test Case 13 */
   1722 static const u8	K13[32],
   1723 		*P13=NULL,
   1724 		*A13=NULL,
   1725 		IV13[12],
   1726 		*C13=NULL,
   1727 		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
   1728 
   1729 /* Test Case 14 */
   1730 #define K14 K13
   1731 #define A14 A13
   1732 static const u8	P14[16],
   1733 		IV14[12],
   1734 		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
   1735 		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
   1736 
   1737 /* Test Case 15 */
   1738 #define A15 A14
   1739 static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
   1740 			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
   1741 		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
   1742 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
   1743 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
   1744 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
   1745 		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
   1746 		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
   1747 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
   1748 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
   1749 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
   1750 		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
   1751 
   1752 /* Test Case 16 */
        /*
         * Same 256-bit key and 96-bit IV as case 15, but with a 60-byte
         * (non-block-multiple) plaintext/ciphertext and a 20-byte AAD,
         * exercising partial-block handling with additional data present.
         */
   1753 #define K16 K15
   1754 #define IV16 IV15
   1755 static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
   1756 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
   1757 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
   1758 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
   1759 		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
   1760 			0xab,0xad,0xda,0xd2},
   1761 		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
   1762 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
   1763 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
   1764 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
   1765 		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
   1766 
   1767 /* Test Case 17 */
        /*
         * Same key, plaintext and AAD as case 16, but with an 8-byte IV --
         * i.e. not the standard 96-bit length, so it presumably exercises
         * the GHASH-based IV-derivation path in setiv (confirm there).
         */
   1768 #define K17 K16
   1769 #define P17 P16
   1770 #define A17 A16
   1771 static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
   1772 		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
   1773 			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
   1774 			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
   1775 			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
   1776 		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
   1777 
   1778 /* Test Case 18 */
        /*
         * Same key, plaintext and AAD as case 17, but with a long 60-byte
         * IV (another non-96-bit length), yielding a different keystream
         * and therefore different ciphertext C18 and tag T18.
         */
   1779 #define K18 K17
   1780 #define P18 P17
   1781 #define A18 A17
   1782 static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
   1783 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
   1784 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
   1785 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
   1786 		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
   1787 			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
   1788 			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
   1789 			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
   1790 		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
   1791 
   1792 /* Test Case 19 */
        /*
         * Reuses test case 1's key, plaintext, IV and ciphertext (declared
         * above this excerpt) and supplies a 128-byte AAD -- the maximal
         * additional-data case in this suite.  Only the tag T19 is new.
         */
   1793 #define K19 K1
   1794 #define P19 P1
   1795 #define IV19 IV1
   1796 #define C19 C1
   1797 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
   1798 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
   1799 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
   1800 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
   1801 			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
   1802 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
   1803 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
   1804 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
   1805 		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
   1806 
   1807 /* Test Case 20 */
        /*
         * Counter-wrap stress case.  IV20 is 64 bytes with only the first
         * four bytes set to 0xff (the rest are implicitly zero, per C
         * static initialization), and P20 is a 288-byte all-zero plaintext,
         * so C20 below is effectively the raw keystream plus GHASH tag T20.
         */
   1808 #define K20 K1
   1809 #define A20 A1
   1810 static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
   1811 		P20[288],
   1812 		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
   1813 			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
   1814 			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
   1815 			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
   1816 			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
   1817 			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
   1818 			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
   1819 			0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
   1820 			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
   1821 			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
   1822 			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
   1823 			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
   1824 			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
   1825 			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
   1826 			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
   1827 			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
   1828 			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
   1829 			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
   1830 		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
   1831 
        /*
         * TEST_CASE(n) -- run GCM vector #n in both directions:
         *   1) encrypt P##n in place into `out', then verify the tag against
         *      T##n (via CRYPTO_gcm128_finish) and the output against C##n;
         *   2) re-init the IV, decrypt C##n, verify tag and recovered P##n.
         * On either mismatch it bumps `ret' and prints which direction failed.
         *
         * The macro expands against `ctx', `key' and `ret' declared in the
         * caller (main below).  The `if (A##n)' / `if (P##n)' / `if (C##n)'
         * guards allow vectors to be declared as NULL pointers elsewhere in
         * the file (not visible in this excerpt); for the array vectors above
         * they are trivially true.  `out' is sized from P##n, so C##n is
         * assumed to have the same length as P##n for every vector.
         */
   1832 #define TEST_CASE(n)	do {					\
   1833 	u8 out[sizeof(P##n)];					\
   1834 	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
   1835 	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
   1836 	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
   1837 	memset(out,0,sizeof(out));				\
   1838 	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
   1839 	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
   1840 	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
   1841 	    (C##n && memcmp(out,C##n,sizeof(out))))		\
   1842 		ret++, printf ("encrypt test#%d failed.\n",n);	\
   1843 	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
   1844 	memset(out,0,sizeof(out));				\
   1845 	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
   1846 	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
   1847 	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
   1848 	    (P##n && memcmp(out,P##n,sizeof(out))))		\
   1849 		ret++, printf ("decrypt test#%d failed.\n",n);	\
   1850 	} while(0)
   1851 
   1852 int main()
   1853 {
   1854 	GCM128_CONTEXT ctx;
   1855 	AES_KEY key;
   1856 	int ret=0;
   1857 
   1858 	TEST_CASE(1);
   1859 	TEST_CASE(2);
   1860 	TEST_CASE(3);
   1861 	TEST_CASE(4);
   1862 	TEST_CASE(5);
   1863 	TEST_CASE(6);
   1864 	TEST_CASE(7);
   1865 	TEST_CASE(8);
   1866 	TEST_CASE(9);
   1867 	TEST_CASE(10);
   1868 	TEST_CASE(11);
   1869 	TEST_CASE(12);
   1870 	TEST_CASE(13);
   1871 	TEST_CASE(14);
   1872 	TEST_CASE(15);
   1873 	TEST_CASE(16);
   1874 	TEST_CASE(17);
   1875 	TEST_CASE(18);
   1876 	TEST_CASE(19);
   1877 	TEST_CASE(20);
   1878 
   1879 #ifdef OPENSSL_CPUID_OBJ
   1880 	{
   1881 	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
   1882 	union { u64 u; u8 c[1024]; } buf;
   1883 	int i;
   1884 
   1885 	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
   1886 	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
   1887 	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
   1888 
   1889 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
   1890 	start = OPENSSL_rdtsc();
   1891 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
   1892 	gcm_t = OPENSSL_rdtsc() - start;
   1893 
   1894 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
   1895 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
   1896 			(block128_f)AES_encrypt);
   1897 	start = OPENSSL_rdtsc();
   1898 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
   1899 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
   1900 			(block128_f)AES_encrypt);
   1901 	ctr_t = OPENSSL_rdtsc() - start;
   1902 
   1903 	printf("%.2f-%.2f=%.2f\n",
   1904 			gcm_t/(double)sizeof(buf),
   1905 			ctr_t/(double)sizeof(buf),
   1906 			(gcm_t-ctr_t)/(double)sizeof(buf));
   1907 #ifdef GHASH
   1908 	{
   1909 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
   1910 				const u8 *inp,size_t len)	= ctx.ghash;
   1911 
   1912 	GHASH((&ctx),buf.c,sizeof(buf));
   1913 	start = OPENSSL_rdtsc();
   1914 	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
   1915 	gcm_t = OPENSSL_rdtsc() - start;
   1916 	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
   1917 	}
   1918 #endif
   1919 	}
   1920 #endif
   1921 
   1922 	return ret;
   1923 }
   1924 #endif
   1925