Home | History | Annotate | Download | only in key_blob_utils
      1 /*------------------------------------------------------------------------
      2 / OCB Version 3 Reference Code (Optimized C)     Last modified 12-JUN-2013
      3 /-------------------------------------------------------------------------
      4 / Copyright (c) 2013 Ted Krovetz.
      5 /
      6 / Permission to use, copy, modify, and/or distribute this software for any
      7 / purpose with or without fee is hereby granted, provided that the above
      8 / copyright notice and this permission notice appear in all copies.
      9 /
     10 / THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     11 / WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     12 / MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     13 / ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     14 / WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     15 / ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     16 / OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     17 /
     18 / Phillip Rogaway holds patents relevant to OCB. See the following for
     19 / his patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
     20 /
     21 / Special thanks to Keegan McAllister for suggesting several good improvements
     22 /
     23 / Comments are welcome: Ted Krovetz <ted (at) krovetz.net> - Dedicated to Laurel K
     24 /------------------------------------------------------------------------- */
     25 
     26 /* ----------------------------------------------------------------------- */
     27 /* Usage notes                                                             */
     28 /* ----------------------------------------------------------------------- */
     29 
     30 /* - When AE_PENDING is passed as the 'final' parameter of any function,
     31 /    the length parameters must be a multiple of (BPI*16).
     32 /  - When available, SSE or AltiVec registers are used to manipulate data.
     33 /    So, when on machines with these facilities, all pointers passed to
     34 /    any function should be 16-byte aligned.
     35 /  - Plaintext and ciphertext pointers may be equal (ie, plaintext gets
     36 /    encrypted in-place), but no other pair of pointers may be equal.
     37 /  - This code assumes all x86 processors have SSE2 and SSSE3 instructions
     38 /    when compiling under MSVC. If untrue, alter the #define.
     39 /  - This code is tested for C99 and recent versions of GCC and MSVC.      */
     40 
     41 /* ----------------------------------------------------------------------- */
     42 /* User configuration options                                              */
     43 /* ----------------------------------------------------------------------- */
     44 
     45 /* Set the AES key length to use and length of authentication tag to produce.
     46 /  Setting either to 0 requires the value be set at runtime via ae_init().
     47 /  Some optimizations occur for each when set to a fixed value.            */
     48 #define OCB_KEY_LEN 16 /* 0, 16, 24 or 32. 0 means set in ae_init */
     49 #define OCB_TAG_LEN 16 /* 0 to 16. 0 means set in ae_init         */
     50 
     51 /* This implementation has built-in support for multiple AES APIs. Set any
     52 /  one of the following to non-zero to specify which to use.               */
     53 #define USE_OPENSSL_AES 1   /* http://openssl.org                      */
     54 #define USE_REFERENCE_AES 0 /* Internet search: rijndael-alg-fst.c     */
     55 #define USE_AES_NI 0        /* Uses compiler's intrinsics              */
     56 
     57 /* During encryption and decryption, various "L values" are required.
     58 /  The L values can be precomputed during initialization (requiring extra
     59 /  space in ae_ctx), generated as needed (slightly slowing encryption and
     60 /  decryption), or some combination of the two. L_TABLE_SZ specifies how many
     61 /  L values to precompute. L_TABLE_SZ must be at least 3. L_TABLE_SZ*16 bytes
     62 /  are used for L values in ae_ctx. Plaintext and ciphertexts shorter than
     63 /  2^L_TABLE_SZ blocks need no L values calculated dynamically.            */
     64 #define L_TABLE_SZ 16
     65 
     66 /* Set L_TABLE_SZ_IS_ENOUGH non-zero iff you know that all plaintexts
     67 /  will be shorter than 2^(L_TABLE_SZ+4) bytes in length. This results
     68 /  in better performance.                                                  */
     69 #define L_TABLE_SZ_IS_ENOUGH 1
     70 
     71 /* ----------------------------------------------------------------------- */
     72 /* Includes and compiler specific definitions                              */
     73 /* ----------------------------------------------------------------------- */
     74 
     75 #include <keymaster/key_blob_utils/ae.h>
     76 #include <stdlib.h>
     77 #include <string.h>
     78 
     79 /* Define standard sized integers                                          */
     80 #if defined(_MSC_VER) && (_MSC_VER < 1600)
     81 typedef unsigned __int8 uint8_t;
     82 typedef unsigned __int32 uint32_t;
     83 typedef unsigned __int64 uint64_t;
     84 typedef __int64 int64_t;
     85 #else
     86 #include <stdint.h>
     87 #endif
     88 
     89 /* Compiler-specific intrinsics and fixes: bswap64, ntz                    */
     90 #if _MSC_VER
     91 #define inline __inline                           /* MSVC doesn't recognize "inline" in C */
     92 #define restrict __restrict                       /* MSVC doesn't recognize "restrict" in C */
     93 #define __SSE2__ (_M_IX86 || _M_AMD64 || _M_X64)  /* Assume SSE2  */
     94 #define __SSSE3__ (_M_IX86 || _M_AMD64 || _M_X64) /* Assume SSSE3 */
     95 #include <intrin.h>
     96 #pragma intrinsic(_byteswap_uint64, _BitScanForward, memcpy)
     97 #define bswap64(x) _byteswap_uint64(x)
     98 static inline unsigned ntz(unsigned x) {
     99     _BitScanForward(&x, x);
    100     return x;
    101 }
    102 #elif __GNUC__
    103 #define inline __inline__                   /* No "inline" in GCC ansi C mode */
    104 #define restrict __restrict__               /* No "restrict" in GCC ansi C mode */
    105 #define bswap64(x) __builtin_bswap64(x)     /* Assuming GCC 4.3+ */
    106 #define ntz(x) __builtin_ctz((unsigned)(x)) /* Assuming GCC 3.4+ */
    107 #else /* Assume some C99 features: stdint.h, inline, restrict */
    108 #define bswap32(x)                                                                                 \
    109     ((((x)&0xff000000u) >> 24) | (((x)&0x00ff0000u) >> 8) | (((x)&0x0000ff00u) << 8) |             \
    110      (((x)&0x000000ffu) << 24))
    111 
    112 static inline uint64_t bswap64(uint64_t x) {
    113     union {
    114         uint64_t u64;
    115         uint32_t u32[2];
    116     } in, out;
    117     in.u64 = x;
    118     out.u32[0] = bswap32(in.u32[1]);
    119     out.u32[1] = bswap32(in.u32[0]);
    120     return out.u64;
    121 }
    122 
    123 #if (L_TABLE_SZ <= 9) && (L_TABLE_SZ_IS_ENOUGH) /* < 2^13 byte texts */
    124 static inline unsigned ntz(unsigned x) {
    125     static const unsigned char tz_table[] = {
    126         0, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2,
    127         3, 2, 4, 2, 3, 2, 7, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2,
    128         4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 8, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2,
    129         3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 7, 2, 3, 2, 4, 2, 3, 2,
    130         5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2};
    131     return tz_table[x / 4];
    132 }
    133 #else                                           /* From http://supertech.csail.mit.edu/papers/debruijn.pdf */
    134 static inline unsigned ntz(unsigned x) {
    135     static const unsigned char tz_table[32] = {0,  1,  28, 2,  29, 14, 24, 3,  30, 22, 20,
    136                                                15, 25, 17, 4,  8,  31, 27, 13, 23, 21, 19,
    137                                                16, 7,  26, 12, 18, 6,  11, 5,  10, 9};
    138     return tz_table[((uint32_t)((x & -x) * 0x077CB531u)) >> 27];
    139 }
    140 #endif
    141 #endif
    142 
    143 /* ----------------------------------------------------------------------- */
    144 /* Define blocks and operations -- Patch if incorrect on your compiler.    */
    145 /* ----------------------------------------------------------------------- */
    146 
    147 #if __SSE2__ && !KEYMASTER_CLANG_TEST_BUILD
    148 #include <xmmintrin.h> /* SSE instructions and _mm_malloc */
    149 #include <emmintrin.h> /* SSE2 instructions               */
    150 typedef __m128i block;
    151 #define xor_block(x, y) _mm_xor_si128(x, y)
    152 #define zero_block() _mm_setzero_si128()
    153 #define unequal_blocks(x, y) (_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) != 0xffff)
    154 #if __SSSE3__ || USE_AES_NI
    155 #include <tmmintrin.h> /* SSSE3 instructions              */
    156 #define swap_if_le(b)                                                                              \
    157     _mm_shuffle_epi8(b, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
    158 #else
/* Byte-reverse a 16-byte block without SSSE3's PSHUFB: reverse the four
   32-bit words, swap the 16-bit halves inside each word, then swap the
   bytes inside each 16-bit lane via paired shifts. */
static inline block swap_if_le(block b) {
    block a = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 1, 2, 3)); /* reverse 32-bit word order */
    a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));     /* swap 16-bit pairs, high half */
    a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));     /* swap 16-bit pairs, low half */
    /* Swap the two bytes within every 16-bit lane. */
    return _mm_xor_si128(_mm_srli_epi16(a, 8), _mm_slli_epi16(a, 8));
}
    165 #endif
/* Extract 128 bits from the 192-bit KtopStr starting at bit 'bot'
   (0..63), returning the result in memory-correct byte order.
   Implemented as a per-64-bit-lane funnel shift. */
static inline block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    block hi = _mm_load_si128((__m128i*)(KtopStr + 0));  /* hi = B A */
    block lo = _mm_loadu_si128((__m128i*)(KtopStr + 1)); /* lo = C B */
    __m128i lshift = _mm_cvtsi32_si128(bot);
    __m128i rshift = _mm_cvtsi32_si128(64 - bot);
    /* (hi << bot) | (lo >> (64 - bot)); when bot == 0 the right-shift
       count is 64, which SSE defines to produce zero, so the result is
       just hi. */
    lo = _mm_xor_si128(_mm_sll_epi64(hi, lshift), _mm_srl_epi64(lo, rshift));
#if __SSSE3__ || USE_AES_NI
    /* Byte-reverse each 64-bit half and swap halves to memory order. */
    return _mm_shuffle_epi8(lo, _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
#else
    return swap_if_le(_mm_shuffle_epi32(lo, _MM_SHUFFLE(1, 0, 3, 2)));
#endif
}
/* Double a 128-bit value: shift left one bit; if the top bit falls off,
   xor in 135 (0x87 — the reduction constant OCB uses for doubling).
   Works on 32-bit lanes: each lane's sign mask, rotated one lane over,
   injects the carried-out bit into its neighbor. */
static inline block double_block(block bl) {
    const __m128i mask = _mm_set_epi32(135, 1, 1, 1);
    __m128i tmp = _mm_srai_epi32(bl, 31);                  /* per-lane MSB mask */
    tmp = _mm_and_si128(tmp, mask);                        /* carry value per lane (135 at the top) */
    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3)); /* rotate carries to receiving lanes */
    bl = _mm_slli_epi32(bl, 1);
    return _mm_xor_si128(bl, tmp);
}
    186 #elif __ALTIVEC__
    187 #include <altivec.h>
    188 typedef vector unsigned block;
    189 #define xor_block(x, y) vec_xor(x, y)
    190 #define zero_block() vec_splat_u32(0)
    191 #define unequal_blocks(x, y) vec_any_ne(x, y)
    192 #define swap_if_le(b) (b)
    193 #if __PPC64__
    194 block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    195     union {
    196         uint64_t u64[2];
    197         block bl;
    198     } rval;
    199     rval.u64[0] = (KtopStr[0] << bot) | (KtopStr[1] >> (64 - bot));
    200     rval.u64[1] = (KtopStr[1] << bot) | (KtopStr[2] >> (64 - bot));
    201     return rval.bl;
    202 }
    203 #else
    204 /* Special handling: Shifts are mod 32, and no 64-bit types */
/* AltiVec without 64-bit shifts: shifts are mod 32, so first align the
   192-bit KtopStr to a 32-bit word boundary with vec_sld, then finish
   with a 32-bit funnel shift for the remaining 0..31 bits.
   NOTE(review): the load at KtopStr + 2 reads 16 bytes starting at byte
   16 of a 24-byte array — the extra 8 bytes never reach the result, but
   the read itself strays past KtopStr (into adjacent ae_ctx fields). */
block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    const vector unsigned k32 = {32, 32, 32, 32};
    vector unsigned hi = *(vector unsigned*)(KtopStr + 0);
    vector unsigned lo = *(vector unsigned*)(KtopStr + 2);
    vector unsigned bot_vec;
    if (bot < 32) {
        lo = vec_sld(hi, lo, 4); /* lo = words 1..4 of the stretch */
    } else {
        vector unsigned t = vec_sld(hi, lo, 4);
        lo = vec_sld(hi, lo, 8); /* consume one whole 32-bit word */
        hi = t;
        bot = bot - 32;
    }
    if (bot == 0)
        return hi; /* word-aligned: no bit-level shifting needed */
    *(unsigned*)&bot_vec = bot; /* only lane 0 is written; vec_splat below uses lane 0 only */
    vector unsigned lshift = vec_splat(bot_vec, 0);
    vector unsigned rshift = vec_sub(k32, lshift);
    hi = vec_sl(hi, lshift);
    lo = vec_sr(lo, rshift);
    return vec_xor(hi, lo);
}
    227 #endif
/* Double a 128-bit value: per-byte left shift with carry propagation.
   vec_sra by 7 yields 0xff in every byte whose MSB is set; masking
   turns that into the carry each byte contributes (135 for the leading
   byte, 1 for the rest), and vec_perm rotates the carries one byte
   toward the low end so the leading byte's carry lands as 0x87 at the
   bottom — the OCB doubling reduction. */
static inline block double_block(block b) {
    const vector unsigned char mask = {135, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    const vector unsigned char perm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0};
    const vector unsigned char shift7 = vec_splat_u8(7);
    const vector unsigned char shift1 = vec_splat_u8(1);
    vector unsigned char c = (vector unsigned char)b;
    vector unsigned char t = vec_sra(c, shift7); /* 0xff where byte MSB set */
    t = vec_and(t, mask);                        /* carry contribution per byte */
    t = vec_perm(t, t, perm);                    /* rotate carries one byte left */
    c = vec_sl(c, shift1);
    return (block)vec_xor(c, t);
}
    240 #elif __ARM_NEON__
    241 #include <arm_neon.h>
    242 typedef int8x16_t block; /* Yay! Endian-neutral reads! */
    243 #define xor_block(x, y) veorq_s8(x, y)
    244 #define zero_block() vdupq_n_s8(0)
/* Nonzero iff the two 16-byte blocks differ: xor them and test whether
   either 64-bit half of the result is nonzero. */
static inline int unequal_blocks(block a, block b) {
    int64x2_t t = veorq_s64((int64x2_t)a, (int64x2_t)b);
    return (vgetq_lane_s64(t, 0) | vgetq_lane_s64(t, 1)) != 0;
}
    249 #define swap_if_le(b) (b) /* Using endian-neutral int8x16_t */
/* KtopStr is reg correct by 64 bits, return mem correct */
/* Extract 128 bits of the 192-bit KtopStr starting at bit 'bot'.
   vshlq_u64 with a negative per-lane count shifts right, so rs =
   bot - 64 produces (lo >> (64 - bot)); combined with (hi << bot) this
   is a per-lane funnel shift. On little-endian hosts each 64-bit lane
   is then byte-reversed to make the result memory correct. */
block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    const union {
        unsigned x;
        unsigned char endian;
    } little = {1}; /* runtime endianness probe */
    const int64x2_t k64 = {-64, -64};
    /* Copy hi and lo into local variables to ensure proper alignment */
    uint64x2_t hi = vld1q_u64(KtopStr + 0); /* hi = A B */
    uint64x2_t lo = vld1q_u64(KtopStr + 1); /* lo = B C */
    int64x2_t ls = vdupq_n_s64(bot);
    int64x2_t rs = vqaddq_s64(k64, ls); /* rs = bot - 64 (negative => shift right) */
    block rval = (block)veorq_u64(vshlq_u64(hi, ls), vshlq_u64(lo, rs));
    if (little.endian)
        rval = vrev64q_s8(rval);
    return rval;
}
/* Double a 128-bit value: per-byte shift left, propagating each byte's
   carried-out MSB to its neighbor; the leading byte's carry wraps
   around as the OCB reduction constant 0x87.
   NOTE(review): 135 exceeds the range of the signed int8x16_t element
   type and converts to -121 (bit pattern 0x87) — intended, but expect a
   constant-conversion warning on some compilers. */
static inline block double_block(block b) {
    const block mask = {135, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    block tmp = vshrq_n_s8(b, 7); /* arithmetic shift: 0xff where byte MSB set */
    tmp = vandq_s8(tmp, mask);
    tmp = vextq_s8(tmp, tmp, 1); /* Rotate high byte to end */
    b = vshlq_n_s8(b, 1);
    return veorq_s8(tmp, b);
}
    275 #else
    276 typedef struct { uint64_t l, r; } block;
    277 static inline block xor_block(block x, block y) {
    278     x.l ^= y.l;
    279     x.r ^= y.r;
    280     return x;
    281 }
    282 static inline block zero_block(void) {
    283     const block t = {0, 0};
    284     return t;
    285 }
    286 #define unequal_blocks(x, y) ((((x).l ^ (y).l) | ((x).r ^ (y).r)) != 0)
    287 static inline block swap_if_le(block b) {
    288     const union {
    289         unsigned x;
    290         unsigned char endian;
    291     } little = {1};
    292     if (little.endian) {
    293         block r;
    294         r.l = bswap64(b.l);
    295         r.r = bswap64(b.r);
    296         return r;
    297     } else
    298         return b;
    299 }
    300 
    301 /* KtopStr is reg correct by 64 bits, return mem correct */
    302 block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    303     block rval;
    304     if (bot != 0) {
    305         rval.l = (KtopStr[0] << bot) | (KtopStr[1] >> (64 - bot));
    306         rval.r = (KtopStr[1] << bot) | (KtopStr[2] >> (64 - bot));
    307     } else {
    308         rval.l = KtopStr[0];
    309         rval.r = KtopStr[1];
    310     }
    311     return swap_if_le(rval);
    312 }
    313 
    314 #if __GNUC__ && __arm__
/* Double a 128-bit value using the ARM carry chain: the four
   add-with-carry instructions shift the whole 128-bit quantity left by
   one bit (each 'adds'/'adcs' doubles a 32-bit word and feeds its carry
   to the next), and the conditional eor folds in the reduction
   constant 135 (0x87) when a bit carries out of the top.
   %0/%H0 and %1/%H1 name the low/high 32-bit words of b.l and b.r. */
static inline block double_block(block b) {
    __asm__("adds %1,%1,%1\n\t"
            "adcs %H1,%H1,%H1\n\t"
            "adcs %0,%0,%0\n\t"
            "adcs %H0,%H0,%H0\n\t"
            "it cs\n\t"
            "eorcs %1,%1,#135"
            : "+r"(b.l), "+r"(b.r)
            :
            : "cc");
    return b;
}
    327 #else
    328 static inline block double_block(block b) {
    329     uint64_t t = (uint64_t)((int64_t)b.l >> 63);
    330     b.l = (b.l + b.l) ^ (b.r >> 63);
    331     b.r = (b.r + b.r) ^ (t & 135);
    332     return b;
    333 }
    334 #endif
    335 
    336 #endif
    337 
    338 /* ----------------------------------------------------------------------- */
    339 /* AES - Code uses OpenSSL API. Other implementations get mapped to it.    */
    340 /* ----------------------------------------------------------------------- */
    341 
    342 /*---------------*/
    343 #if USE_OPENSSL_AES
    344 /*---------------*/
    345 
    346 #include <openssl/aes.h> /* http://openssl.org/ */
    347 
    348 /* How to ECB encrypt an array of blocks, in place                         */
    349 static inline void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    350     while (nblks) {
    351         --nblks;
    352         AES_encrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key);
    353     }
    354 }
    355 
    356 static inline void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    357     while (nblks) {
    358         --nblks;
    359         AES_decrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key);
    360     }
    361 }
    362 
    363 #define BPI 4 /* Number of blocks in buffer per ECB call */
    364 
    365 /*-------------------*/
    366 #elif USE_REFERENCE_AES
    367 /*-------------------*/
    368 
    369 #include "rijndael-alg-fst.h" /* Barreto's Public-Domain Code */
    370 #if (OCB_KEY_LEN == 0)
    371 typedef struct {
    372     uint32_t rd_key[60];
    373     int rounds;
    374 } AES_KEY;
    375 #define ROUNDS(ctx) ((ctx)->rounds)
    376 #define AES_set_encrypt_key(x, y, z)                                                               \
    377     do {                                                                                           \
    378         rijndaelKeySetupEnc((z)->rd_key, x, y);                                                    \
    379         (z)->rounds = y / 32 + 6;                                                                  \
    380     } while (0)
    381 #define AES_set_decrypt_key(x, y, z)                                                               \
    382     do {                                                                                           \
    383         rijndaelKeySetupDec((z)->rd_key, x, y);                                                    \
    384         (z)->rounds = y / 32 + 6;                                                                  \
    385     } while (0)
    386 #else
    387 typedef struct { uint32_t rd_key[OCB_KEY_LEN + 28]; } AES_KEY;
    388 #define ROUNDS(ctx) (6 + OCB_KEY_LEN / 4)
    389 #define AES_set_encrypt_key(x, y, z) rijndaelKeySetupEnc((z)->rd_key, x, y)
    390 #define AES_set_decrypt_key(x, y, z) rijndaelKeySetupDec((z)->rd_key, x, y)
    391 #endif
    392 #define AES_encrypt(x, y, z) rijndaelEncrypt((z)->rd_key, ROUNDS(z), x, y)
    393 #define AES_decrypt(x, y, z) rijndaelDecrypt((z)->rd_key, ROUNDS(z), x, y)
    394 
    395 static void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    396     while (nblks) {
    397         --nblks;
    398         AES_encrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key);
    399     }
    400 }
    401 
    402 void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    403     while (nblks) {
    404         --nblks;
    405         AES_decrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key);
    406     }
    407 }
    408 
    409 #define BPI 4 /* Number of blocks in buffer per ECB call */
    410 
    411 /*----------*/
    412 #elif USE_AES_NI
    413 /*----------*/
    414 
    415 #include <wmmintrin.h>
    416 
    417 #if (OCB_KEY_LEN == 0)
    418 typedef struct {
    419     __m128i rd_key[15];
    420     int rounds;
    421 } AES_KEY;
    422 #define ROUNDS(ctx) ((ctx)->rounds)
    423 #else
    424 typedef struct { __m128i rd_key[7 + OCB_KEY_LEN / 4]; } AES_KEY;
    425 #define ROUNDS(ctx) (6 + OCB_KEY_LEN / 4)
    426 #endif
    427 
    428 #define EXPAND_ASSIST(v1, v2, v3, v4, shuff_const, aes_const)                                      \
    429     v2 = _mm_aeskeygenassist_si128(v4, aes_const);                                                 \
    430     v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), _mm_castsi128_ps(v1), 16));         \
    431     v1 = _mm_xor_si128(v1, v3);                                                                    \
    432     v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), _mm_castsi128_ps(v1), 140));        \
    433     v1 = _mm_xor_si128(v1, v3);                                                                    \
    434     v2 = _mm_shuffle_epi32(v2, shuff_const);                                                       \
    435     v1 = _mm_xor_si128(v1, v2)
    436 
    437 #define EXPAND192_STEP(idx, aes_const)                                                             \
    438     EXPAND_ASSIST(x0, x1, x2, x3, 85, aes_const);                                                  \
    439     x3 = _mm_xor_si128(x3, _mm_slli_si128(x3, 4));                                                 \
    440     x3 = _mm_xor_si128(x3, _mm_shuffle_epi32(x0, 255));                                            \
    441     kp[idx] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(x0), 68));   \
    442     kp[idx + 1] =                                                                                  \
    443         _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x3), 78));          \
    444     EXPAND_ASSIST(x0, x1, x2, x3, 85, (aes_const * 2));                                            \
    445     x3 = _mm_xor_si128(x3, _mm_slli_si128(x3, 4));                                                 \
    446     x3 = _mm_xor_si128(x3, _mm_shuffle_epi32(x0, 255));                                            \
    447     kp[idx + 2] = x0;                                                                              \
    448     tmp = x3
    449 
/* Expand a 128-bit AES key into the 11 round keys kp[0..10] using the
   AESKEYGENASSIST-based schedule; the final arguments are the AES
   round constants 1, 2, 4, ..., 128, 27, 54. */
static void AES_128_Key_Expansion(const unsigned char* userkey, void* key) {
    __m128i x0, x1, x2;
    __m128i* kp = (__m128i*)key;
    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey); /* round 0 key = raw user key */
    x2 = _mm_setzero_si128();
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 1);
    kp[1] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 2);
    kp[2] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 4);
    kp[3] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 8);
    kp[4] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 16);
    kp[5] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 32);
    kp[6] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 64);
    kp[7] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 128);
    kp[8] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 27);
    kp[9] = x0;
    EXPAND_ASSIST(x0, x1, x2, x0, 255, 54);
    kp[10] = x0;
}
    476 
/* Expand a 192-bit AES key into the 13 round keys kp[0..12]. Each
   EXPAND192_STEP runs two schedule iterations and emits three round
   keys (the 192-bit schedule does not align with 128-bit round keys,
   hence the shuffling inside the macro). */
static void AES_192_Key_Expansion(const unsigned char* userkey, void* key) {
    __m128i x0, x1, x2, x3, tmp, *kp = (__m128i*)key;
    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey);        /* key bytes 0..15 */
    tmp = x3 = _mm_loadu_si128((__m128i*)(userkey + 16));   /* key bytes 16..23 (+ 8 bytes reread) */
    x2 = _mm_setzero_si128();
    EXPAND192_STEP(1, 1);
    EXPAND192_STEP(4, 4);
    EXPAND192_STEP(7, 16);
    EXPAND192_STEP(10, 64);
}
    487 
/* Expand a 256-bit AES key into the 15 round keys kp[0..14]. The two
   128-bit key halves are expanded alternately: shuffle constant 255
   with a fresh round constant for even rounds, 170 (no new round
   constant progression) for odd rounds, per the AES-256 schedule. */
static void AES_256_Key_Expansion(const unsigned char* userkey, void* key) {
    __m128i x0, x1, x2, x3, *kp = (__m128i*)key;
    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey);        /* first key half */
    kp[1] = x3 = _mm_loadu_si128((__m128i*)(userkey + 16)); /* second key half */
    x2 = _mm_setzero_si128();
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 1);
    kp[2] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 1);
    kp[3] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 2);
    kp[4] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 2);
    kp[5] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 4);
    kp[6] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 4);
    kp[7] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 8);
    kp[8] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 8);
    kp[9] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 16);
    kp[10] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 16);
    kp[11] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 32);
    kp[12] = x0;
    EXPAND_ASSIST(x3, x1, x2, x0, 170, 32);
    kp[13] = x3;
    EXPAND_ASSIST(x0, x1, x2, x3, 255, 64);
    kp[14] = x0;
}
    520 
    521 static int AES_set_encrypt_key(const unsigned char* userKey, const int bits, AES_KEY* key) {
    522     if (bits == 128) {
    523         AES_128_Key_Expansion(userKey, key);
    524     } else if (bits == 192) {
    525         AES_192_Key_Expansion(userKey, key);
    526     } else if (bits == 256) {
    527         AES_256_Key_Expansion(userKey, key);
    528     }
    529 #if (OCB_KEY_LEN == 0)
    530     key->rounds = 6 + bits / 32;
    531 #endif
    532     return 0;
    533 }
    534 
/* Derive the decryption schedule from an encryption schedule: the
   decryption round keys are the encryption round keys in reverse
   order, with AESIMC (InvMixColumns) applied to every key except the
   first and last — the form AESDEC expects. */
static void AES_set_decrypt_key_fast(AES_KEY* dkey, const AES_KEY* ekey) {
    int j = 0;
    int i = ROUNDS(ekey);
#if (OCB_KEY_LEN == 0)
    dkey->rounds = i;
#endif
    dkey->rd_key[i--] = ekey->rd_key[j++]; /* last dkey <- first ekey, unchanged */
    while (i)
        dkey->rd_key[i--] = _mm_aesimc_si128(ekey->rd_key[j++]);
    dkey->rd_key[i] = ekey->rd_key[j]; /* first dkey <- last ekey, unchanged */
}
    546 
    547 static int AES_set_decrypt_key(const unsigned char* userKey, const int bits, AES_KEY* key) {
    548     AES_KEY temp_key;
    549     AES_set_encrypt_key(userKey, bits, &temp_key);
    550     AES_set_decrypt_key_fast(key, &temp_key);
    551     return 0;
    552 }
    553 
/* Encrypt one 16-byte block with AES-NI.
   NOTE(review): uses aligned load/store (_mm_load_si128/_mm_store_si128),
   so 'in' and 'out' must be 16-byte aligned — confirm callers. */
static inline void AES_encrypt(const unsigned char* in, unsigned char* out, const AES_KEY* key) {
    int j, rnds = ROUNDS(key);
    const __m128i* sched = ((__m128i*)(key->rd_key));
    __m128i tmp = _mm_load_si128((__m128i*)in);
    tmp = _mm_xor_si128(tmp, sched[0]); /* initial AddRoundKey */
    for (j = 1; j < rnds; j++)
        tmp = _mm_aesenc_si128(tmp, sched[j]);
    tmp = _mm_aesenclast_si128(tmp, sched[j]); /* final round (no MixColumns) */
    _mm_store_si128((__m128i*)out, tmp);
}
    564 
/* Decrypt one 16-byte block with AES-NI; expects the inverse schedule
   produced by AES_set_decrypt_key_fast.
   NOTE(review): aligned load/store — 'in' and 'out' must be 16-byte
   aligned, mirroring AES_encrypt above. */
static inline void AES_decrypt(const unsigned char* in, unsigned char* out, const AES_KEY* key) {
    int j, rnds = ROUNDS(key);
    const __m128i* sched = ((__m128i*)(key->rd_key));
    __m128i tmp = _mm_load_si128((__m128i*)in);
    tmp = _mm_xor_si128(tmp, sched[0]); /* initial AddRoundKey */
    for (j = 1; j < rnds; j++)
        tmp = _mm_aesdec_si128(tmp, sched[j]);
    tmp = _mm_aesdeclast_si128(tmp, sched[j]); /* final inverse round */
    _mm_store_si128((__m128i*)out, tmp);
}
    575 
/* ECB-encrypt nblks 16-byte blocks in place, applying each round to
   every block before moving to the next round so independent AESENC
   instructions can overlap in the pipeline. */
static inline void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    unsigned i, j, rnds = ROUNDS(key);
    const __m128i* sched = ((__m128i*)(key->rd_key));
    for (i = 0; i < nblks; ++i)
        blks[i] = _mm_xor_si128(blks[i], sched[0]); /* initial AddRoundKey */
    for (j = 1; j < rnds; ++j)
        for (i = 0; i < nblks; ++i)
            blks[i] = _mm_aesenc_si128(blks[i], sched[j]);
    for (i = 0; i < nblks; ++i)
        blks[i] = _mm_aesenclast_si128(blks[i], sched[j]); /* final round */
}
    587 
/* ECB-decrypt nblks 16-byte blocks in place; same round-interleaved
   structure as AES_ecb_encrypt_blks, using the inverse cipher. */
static inline void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
    unsigned i, j, rnds = ROUNDS(key);
    const __m128i* sched = ((__m128i*)(key->rd_key));
    for (i = 0; i < nblks; ++i)
        blks[i] = _mm_xor_si128(blks[i], sched[0]); /* initial AddRoundKey */
    for (j = 1; j < rnds; ++j)
        for (i = 0; i < nblks; ++i)
            blks[i] = _mm_aesdec_si128(blks[i], sched[j]);
    for (i = 0; i < nblks; ++i)
        blks[i] = _mm_aesdeclast_si128(blks[i], sched[j]); /* final inverse round */
}
    599 
    600 #define BPI 8 /* Number of blocks in buffer per ECB call   */
    601 /* Set to 4 for Westmere, 8 for Sandy Bridge */
    602 
    603 #endif
    604 
    605 /* ----------------------------------------------------------------------- */
    606 /* Define OCB context structure.                                           */
    607 /* ----------------------------------------------------------------------- */
    608 
    609 /*------------------------------------------------------------------------
    610 / Each item in the OCB context is stored either "memory correct" or
    611 / "register correct". On big-endian machines, this is identical. On
    612 / little-endian machines, one must choose whether the byte-string
    613 / is in the correct order when it resides in memory or in registers.
    614 / It must be register correct whenever it is to be manipulated
    615 / arithmetically, but must be memory correct whenever it interacts
    616 / with the plaintext or ciphertext.
    617 /------------------------------------------------------------------------- */
    618 
/* OCB mode state. The memory/register-correct annotations refer to the
   endianness discipline described in the comment block above. */
struct _ae_ctx {
    block offset;        /* Memory correct               */
    block checksum;      /* Memory correct               */
    block Lstar;         /* Memory correct               */
    block Ldollar;       /* Memory correct               */
    block L[L_TABLE_SZ]; /* Memory correct               */
    block ad_checksum;   /* Memory correct               */
    block ad_offset;     /* Memory correct               */
    block cached_Top;    /* Memory correct               */
    uint64_t KtopStr[3]; /* Register correct, each item  */
    uint32_t ad_blocks_processed; /* running associated-data block count */
    uint32_t blocks_processed;    /* running plaintext/ciphertext block count */
    AES_KEY decrypt_key;          /* expanded AES decryption schedule */
    AES_KEY encrypt_key;          /* expanded AES encryption schedule */
#if (OCB_TAG_LEN == 0)
    unsigned tag_len; /* runtime tag length (only when not fixed at compile time) */
#endif
};
    637 
    638 /* ----------------------------------------------------------------------- */
    639 /* L table lookup (or on-the-fly generation)                               */
    640 /* ----------------------------------------------------------------------- */
    641 
    642 #if L_TABLE_SZ_IS_ENOUGH
    643 #define getL(_ctx, _tz) ((_ctx)->L[_tz])
    644 #else
    645 static block getL(const ae_ctx* ctx, unsigned tz) {
    646     if (tz < L_TABLE_SZ)
    647         return ctx->L[tz];
    648     else {
    649         unsigned i;
    650         /* Bring L[MAX] into registers, make it register correct */
    651         block rval = swap_if_le(ctx->L[L_TABLE_SZ - 1]);
    652         rval = double_block(rval);
    653         for (i = L_TABLE_SZ; i < tz; i++)
    654             rval = double_block(rval);
    655         return swap_if_le(rval); /* To memory correct */
    656     }
    657 }
    658 #endif
    659 
    660 /* ----------------------------------------------------------------------- */
    661 /* Public functions                                                        */
    662 /* ----------------------------------------------------------------------- */
    663 
    664 /* 32-bit SSE2 and Altivec systems need to be forced to allocate memory
    665    on 16-byte alignments. (I believe all major 64-bit systems do already.) */
    666 
    667 ae_ctx* ae_allocate(void* misc) {
    668     void* p;
    669     (void)misc; /* misc unused in this implementation */
    670 #if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
    671     p = _mm_malloc(sizeof(ae_ctx), 16);
    672 #elif(__ALTIVEC__ && !__PPC64__)
    673     if (posix_memalign(&p, 16, sizeof(ae_ctx)) != 0)
    674         p = NULL;
    675 #else
    676     p = malloc(sizeof(ae_ctx));
    677 #endif
    678     return (ae_ctx*)p;
    679 }
    680 
/* Release a context obtained from ae_allocate.  The deallocator must match
   the allocator branch chosen there (_mm_malloc pairs with _mm_free; the
   posix_memalign and malloc paths both pair with free). */
void ae_free(ae_ctx* ctx) {
#if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
    _mm_free(ctx);
#else
    free(ctx);
#endif
}
    688 
    689 /* ----------------------------------------------------------------------- */
    690 
    691 int ae_clear(ae_ctx* ctx) /* Zero ae_ctx and undo initialization          */
    692 {
    693     memset(ctx, 0, sizeof(ae_ctx));
    694     return AE_SUCCESS;
    695 }
    696 
/* Report sizeof(ae_ctx) so callers can allocate context storage themselves
   instead of using ae_allocate. */
int ae_ctx_sizeof(void) {
    return (int)sizeof(ae_ctx);
}
    700 
    701 /* ----------------------------------------------------------------------- */
    702 
/* Initialize ctx for the given AES key: build both AES key schedules and
   the key-dependent OCB constants L_*, L_$ and the L[] doubling table.
   Only 12-byte nonces are supported.  tag_len (bytes) is recorded only in
   variable-tag builds (OCB_TAG_LEN == 0); otherwise it is ignored.
   Returns AE_SUCCESS, or AE_NOT_SUPPORTED for an unsupported nonce_len. */
int ae_init(ae_ctx* ctx, const void* key, int key_len, int nonce_len, int tag_len) {
    unsigned i;
    block tmp_blk;

    if (nonce_len != 12)
        return AE_NOT_SUPPORTED;

/* Initialize encryption & decryption keys */
#if (OCB_KEY_LEN > 0)
    key_len = OCB_KEY_LEN; /* Fixed-key-length build: override caller's key_len */
#endif
    AES_set_encrypt_key((unsigned char*)key, key_len * 8, &ctx->encrypt_key);
#if USE_AES_NI
    AES_set_decrypt_key_fast(&ctx->decrypt_key, &ctx->encrypt_key);
#else
    AES_set_decrypt_key((unsigned char*)key, (int)(key_len * 8), &ctx->decrypt_key);
#endif

    /* Zero things that need zeroing */
    ctx->cached_Top = ctx->ad_checksum = zero_block();
    ctx->ad_blocks_processed = 0;

    /* Compute key-dependent values.  cached_Top is the zero block here, so
       this computes L_* = E_K(0^128); L_$ and each L[i] then follow by
       successive doubling, done in register-correct byte order. */
    AES_encrypt((unsigned char*)&ctx->cached_Top, (unsigned char*)&ctx->Lstar, &ctx->encrypt_key);
    tmp_blk = swap_if_le(ctx->Lstar);
    tmp_blk = double_block(tmp_blk);
    ctx->Ldollar = swap_if_le(tmp_blk);
    tmp_blk = double_block(tmp_blk);
    ctx->L[0] = swap_if_le(tmp_blk);
    for (i = 1; i < L_TABLE_SZ; i++) {
        tmp_blk = double_block(tmp_blk);
        ctx->L[i] = swap_if_le(tmp_blk);
    }

#if (OCB_TAG_LEN == 0)
    ctx->tag_len = tag_len; /* Variable-tag build: remember requested length */
#else
    (void)tag_len; /* Suppress var not used error */
#endif

    return AE_SUCCESS;
}
    745 
    746 /* ----------------------------------------------------------------------- */
    747 
/* Derive the initial per-message offset from a 12-byte nonce.  A 16-byte
   block is formed from a tag-length encoding, a 1 bit, and the nonce; its
   low 6 bits become the shift index ("bottom") and the remainder ("Top")
   is encrypted to Ktop, which is stretched into KtopStr.  Top/Ktop are
   cached in ctx, so nonces differing only in the low 6 bits (e.g. a
   counter) skip the AES call. */
static block gen_offset_from_nonce(ae_ctx* ctx, const void* nonce) {
    const union {
        unsigned x;
        unsigned char endian;
    } little = {1}; /* Runtime endianness probe: .endian != 0 on little-endian hosts */
    union {
        uint32_t u32[4];
        uint8_t u8[16];
        block bl;
    } tmp;
    unsigned idx;

/* Replace cached nonce Top if needed.  The first word encodes the tag
   length in bits (mod 128) next to a fixed 1 bit, laid out to match the
   host byte order. */
#if (OCB_TAG_LEN > 0)
    if (little.endian)
        tmp.u32[0] = 0x01000000 + ((OCB_TAG_LEN * 8 % 128) << 1);
    else
        tmp.u32[0] = 0x00000001 + ((OCB_TAG_LEN * 8 % 128) << 25);
#else
    if (little.endian)
        tmp.u32[0] = 0x01000000 + ((ctx->tag_len * 8 % 128) << 1);
    else
        tmp.u32[0] = 0x00000001 + ((ctx->tag_len * 8 % 128) << 25);
#endif
    /* NOTE(review): these word reads assume the nonce pointer is 4-byte
       aligned -- confirm against callers. */
    tmp.u32[1] = ((uint32_t*)nonce)[0];
    tmp.u32[2] = ((uint32_t*)nonce)[1];
    tmp.u32[3] = ((uint32_t*)nonce)[2];
    idx = (unsigned)(tmp.u8[15] & 0x3f);           /* Get low 6 bits of nonce  */
    tmp.u8[15] = tmp.u8[15] & 0xc0;                /* Zero low 6 bits of nonce */
    if (unequal_blocks(tmp.bl, ctx->cached_Top)) { /* Cached?       */
        ctx->cached_Top = tmp.bl;                  /* Update cache, KtopStr    */
        AES_encrypt(tmp.u8, (unsigned char*)&ctx->KtopStr, &ctx->encrypt_key);
        if (little.endian) { /* Make Register Correct    */
            ctx->KtopStr[0] = bswap64(ctx->KtopStr[0]);
            ctx->KtopStr[1] = bswap64(ctx->KtopStr[1]);
        }
        /* Stretch: third word is the top 64 bits of Ktop ^ (Ktop << 8) */
        ctx->KtopStr[2] = ctx->KtopStr[0] ^ (ctx->KtopStr[0] << 8) ^ (ctx->KtopStr[1] >> 56);
    }
    return gen_offset(ctx->KtopStr, idx); /* Bit-shift idx into the stretched Ktop */
}
    788 
/* Absorb associated data into ctx->ad_checksum.  Whole BPI-block chunks
   are hashed in ECB batches; when 'final' is set the remaining tail --
   including a 10*-padded partial block, if any -- is folded in as well.
   AD contributes only to the tag, never to the ciphertext.  State
   (ad_offset / ad_checksum / ad_blocks_processed) persists in ctx so AD
   may be supplied incrementally across AE_PENDING calls. */
static void process_ad(ae_ctx* ctx, const void* ad, int ad_len, int final) {
    union {
        uint32_t u32[4];
        uint8_t u8[16];
        block bl;
    } tmp;
    block ad_offset, ad_checksum;
    const block* adp = (block*)ad;
    unsigned i, k, tz, remaining;

    ad_offset = ctx->ad_offset;
    ad_checksum = ctx->ad_checksum;
    i = ad_len / (BPI * 16);
    if (i) {
        unsigned ad_block_num = ctx->ad_blocks_processed;
        do {
            block ta[BPI], oa[BPI];
            ad_block_num += BPI;
            tz = ntz(ad_block_num); /* L index for the last block of this batch */
            /* oa[j] = offset for AD block j; ta[j] = AD block XOR its offset */
            oa[0] = xor_block(ad_offset, ctx->L[0]);
            ta[0] = xor_block(oa[0], adp[0]);
            oa[1] = xor_block(oa[0], ctx->L[1]);
            ta[1] = xor_block(oa[1], adp[1]);
            oa[2] = xor_block(ad_offset, ctx->L[1]);
            ta[2] = xor_block(oa[2], adp[2]);
#if BPI == 4
            ad_offset = xor_block(oa[2], getL(ctx, tz));
            ta[3] = xor_block(ad_offset, adp[3]);
#elif BPI == 8
            oa[3] = xor_block(oa[2], ctx->L[2]);
            ta[3] = xor_block(oa[3], adp[3]);
            oa[4] = xor_block(oa[1], ctx->L[2]);
            ta[4] = xor_block(oa[4], adp[4]);
            oa[5] = xor_block(oa[0], ctx->L[2]);
            ta[5] = xor_block(oa[5], adp[5]);
            oa[6] = xor_block(ad_offset, ctx->L[2]);
            ta[6] = xor_block(oa[6], adp[6]);
            ad_offset = xor_block(oa[6], getL(ctx, tz));
            ta[7] = xor_block(ad_offset, adp[7]);
#endif
            AES_ecb_encrypt_blks(ta, BPI, &ctx->encrypt_key);
            /* ad_checksum ^= E_K(A_j ^ offset_j) for each block in the batch */
            ad_checksum = xor_block(ad_checksum, ta[0]);
            ad_checksum = xor_block(ad_checksum, ta[1]);
            ad_checksum = xor_block(ad_checksum, ta[2]);
            ad_checksum = xor_block(ad_checksum, ta[3]);
#if (BPI == 8)
            ad_checksum = xor_block(ad_checksum, ta[4]);
            ad_checksum = xor_block(ad_checksum, ta[5]);
            ad_checksum = xor_block(ad_checksum, ta[6]);
            ad_checksum = xor_block(ad_checksum, ta[7]);
#endif
            adp += BPI;
        } while (--i);
        ctx->ad_blocks_processed = ad_block_num;
        ctx->ad_offset = ad_offset;
        ctx->ad_checksum = ad_checksum;
    }

    if (final) {
        block ta[BPI];

        /* Process remaining associated data, compute its tag contribution */
        remaining = ((unsigned)ad_len) % (BPI * 16);
        if (remaining) {
            k = 0;
#if (BPI == 8)
            if (remaining >= 64) {
                tmp.bl = xor_block(ad_offset, ctx->L[0]);
                ta[0] = xor_block(tmp.bl, adp[0]);
                tmp.bl = xor_block(tmp.bl, ctx->L[1]);
                ta[1] = xor_block(tmp.bl, adp[1]);
                ad_offset = xor_block(ad_offset, ctx->L[1]);
                ta[2] = xor_block(ad_offset, adp[2]);
                ad_offset = xor_block(ad_offset, ctx->L[2]);
                ta[3] = xor_block(ad_offset, adp[3]);
                remaining -= 64;
                k = 4;
            }
#endif
            if (remaining >= 32) {
                ad_offset = xor_block(ad_offset, ctx->L[0]);
                ta[k] = xor_block(ad_offset, adp[k]);
                ad_offset = xor_block(ad_offset, getL(ctx, ntz(k + 2)));
                ta[k + 1] = xor_block(ad_offset, adp[k + 1]);
                remaining -= 32;
                k += 2;
            }
            if (remaining >= 16) {
                ad_offset = xor_block(ad_offset, ctx->L[0]);
                ta[k] = xor_block(ad_offset, adp[k]);
                remaining = remaining - 16;
                ++k;
            }
            if (remaining) {
                /* Partial final block: pad with a 0x80 byte then zeros
                   (10* padding) and use the L_* offset. */
                ad_offset = xor_block(ad_offset, ctx->Lstar);
                tmp.bl = zero_block();
                memcpy(tmp.u8, adp + k, remaining);
                tmp.u8[remaining] = (unsigned char)0x80u;
                ta[k] = xor_block(ad_offset, tmp.bl);
                ++k;
            }
            AES_ecb_encrypt_blks(ta, k, &ctx->encrypt_key);
            switch (k) { /* Deliberate fallthrough: XOR in all k results */
#if (BPI == 8)
            case 8:
                ad_checksum = xor_block(ad_checksum, ta[7]);
            /* fall through */
            case 7:
                ad_checksum = xor_block(ad_checksum, ta[6]);
            /* fall through */
            case 6:
                ad_checksum = xor_block(ad_checksum, ta[5]);
            /* fall through */
            case 5:
                ad_checksum = xor_block(ad_checksum, ta[4]);
            /* fall through */
#endif
            case 4:
                ad_checksum = xor_block(ad_checksum, ta[3]);
            /* fall through */
            case 3:
                ad_checksum = xor_block(ad_checksum, ta[2]);
            /* fall through */
            case 2:
                ad_checksum = xor_block(ad_checksum, ta[1]);
            /* fall through */
            case 1:
                ad_checksum = xor_block(ad_checksum, ta[0]);
            }
            ctx->ad_checksum = ad_checksum;
        }
    }
}
    915 
    916 /* ----------------------------------------------------------------------- */
    917 
/* OCB encryption.  Encrypts pt_len bytes of pt into ct; when 'final' is
   set, the authentication tag is produced (written to 'tag' if non-NULL,
   otherwise appended to ct).  A non-NULL nonce starts a new message; a
   NULL nonce continues a message begun by an earlier AE_PENDING call, in
   which case lengths must be multiples of BPI*16 (see the usage notes at
   the top of this file).  Returns the number of bytes written to ct. */
int ae_encrypt(ae_ctx* ctx, const void* nonce, const void* pt, int pt_len, const void* ad,
               int ad_len, void* ct, void* tag, int final) {
    union {
        uint32_t u32[4];
        uint8_t u8[16];
        block bl;
    } tmp;
    block offset, checksum;
    unsigned i, k;
    block* ctp = (block*)ct;
    const block* ptp = (block*)pt;

    /* Non-null nonce means start of new message, init per-message values */
    if (nonce) {
        ctx->offset = gen_offset_from_nonce(ctx, nonce);
        ctx->ad_offset = ctx->checksum = zero_block();
        ctx->ad_blocks_processed = ctx->blocks_processed = 0;
        /* Negative ad_len appears to mean "reuse prior AD checksum"; only a
           non-negative ad_len resets it.  NOTE(review): confirm with callers
           (the #if 0 test harness below passes ad_len == -1 this way). */
        if (ad_len >= 0)
            ctx->ad_checksum = zero_block();
    }

    /* Process associated data */
    if (ad_len > 0)
        process_ad(ctx, ad, ad_len, final);

    /* Encrypt plaintext data BPI blocks at a time */
    offset = ctx->offset;
    checksum = ctx->checksum;
    i = pt_len / (BPI * 16);
    if (i) {
        block oa[BPI];
        unsigned block_num = ctx->blocks_processed;
        oa[BPI - 1] = offset; /* Seed: running offset lives in oa[BPI-1] across iterations */
        do {
            block ta[BPI];
            block_num += BPI;
            /* oa[j] = offset for block j; ta[j] = plaintext XOR offset,
               encrypted in one ECB batch below. */
            oa[0] = xor_block(oa[BPI - 1], ctx->L[0]);
            ta[0] = xor_block(oa[0], ptp[0]);
            checksum = xor_block(checksum, ptp[0]);
            oa[1] = xor_block(oa[0], ctx->L[1]);
            ta[1] = xor_block(oa[1], ptp[1]);
            checksum = xor_block(checksum, ptp[1]);
            oa[2] = xor_block(oa[1], ctx->L[0]);
            ta[2] = xor_block(oa[2], ptp[2]);
            checksum = xor_block(checksum, ptp[2]);
#if BPI == 4
            oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
            ta[3] = xor_block(oa[3], ptp[3]);
            checksum = xor_block(checksum, ptp[3]);
#elif BPI == 8
            oa[3] = xor_block(oa[2], ctx->L[2]);
            ta[3] = xor_block(oa[3], ptp[3]);
            checksum = xor_block(checksum, ptp[3]);
            oa[4] = xor_block(oa[1], ctx->L[2]);
            ta[4] = xor_block(oa[4], ptp[4]);
            checksum = xor_block(checksum, ptp[4]);
            oa[5] = xor_block(oa[0], ctx->L[2]);
            ta[5] = xor_block(oa[5], ptp[5]);
            checksum = xor_block(checksum, ptp[5]);
            oa[6] = xor_block(oa[7], ctx->L[2]); /* oa[7] still holds the previous round's offset */
            ta[6] = xor_block(oa[6], ptp[6]);
            checksum = xor_block(checksum, ptp[6]);
            oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
            ta[7] = xor_block(oa[7], ptp[7]);
            checksum = xor_block(checksum, ptp[7]);
#endif
            AES_ecb_encrypt_blks(ta, BPI, &ctx->encrypt_key);
            /* C_j = E_K(P_j ^ offset_j) ^ offset_j */
            ctp[0] = xor_block(ta[0], oa[0]);
            ctp[1] = xor_block(ta[1], oa[1]);
            ctp[2] = xor_block(ta[2], oa[2]);
            ctp[3] = xor_block(ta[3], oa[3]);
#if (BPI == 8)
            ctp[4] = xor_block(ta[4], oa[4]);
            ctp[5] = xor_block(ta[5], oa[5]);
            ctp[6] = xor_block(ta[6], oa[6]);
            ctp[7] = xor_block(ta[7], oa[7]);
#endif
            ptp += BPI;
            ctp += BPI;
        } while (--i);
        ctx->offset = offset = oa[BPI - 1];
        ctx->blocks_processed = block_num;
        ctx->checksum = checksum;
    }

    if (final) {
        block ta[BPI + 1], oa[BPI];

        /* Process remaining plaintext and compute its tag contribution    */
        unsigned remaining = ((unsigned)pt_len) % (BPI * 16);
        k = 0; /* How many blocks in ta[] need ECBing */
        if (remaining) {
#if (BPI == 8)
            if (remaining >= 64) {
                oa[0] = xor_block(offset, ctx->L[0]);
                ta[0] = xor_block(oa[0], ptp[0]);
                checksum = xor_block(checksum, ptp[0]);
                oa[1] = xor_block(oa[0], ctx->L[1]);
                ta[1] = xor_block(oa[1], ptp[1]);
                checksum = xor_block(checksum, ptp[1]);
                oa[2] = xor_block(oa[1], ctx->L[0]);
                ta[2] = xor_block(oa[2], ptp[2]);
                checksum = xor_block(checksum, ptp[2]);
                offset = oa[3] = xor_block(oa[2], ctx->L[2]);
                ta[3] = xor_block(offset, ptp[3]);
                checksum = xor_block(checksum, ptp[3]);
                remaining -= 64;
                k = 4;
            }
#endif
            if (remaining >= 32) {
                oa[k] = xor_block(offset, ctx->L[0]);
                ta[k] = xor_block(oa[k], ptp[k]);
                checksum = xor_block(checksum, ptp[k]);
                offset = oa[k + 1] = xor_block(oa[k], ctx->L[1]);
                ta[k + 1] = xor_block(offset, ptp[k + 1]);
                checksum = xor_block(checksum, ptp[k + 1]);
                remaining -= 32;
                k += 2;
            }
            if (remaining >= 16) {
                offset = oa[k] = xor_block(offset, ctx->L[0]);
                ta[k] = xor_block(offset, ptp[k]);
                checksum = xor_block(checksum, ptp[k]);
                remaining -= 16;
                ++k;
            }
            if (remaining) {
                /* Partial final block: fold the 10*-padded plaintext into the
                   checksum and queue offset ^ L_* so the ECB below yields its Pad. */
                tmp.bl = zero_block();
                memcpy(tmp.u8, ptp + k, remaining);
                tmp.u8[remaining] = (unsigned char)0x80u;
                checksum = xor_block(checksum, tmp.bl);
                ta[k] = offset = xor_block(offset, ctx->Lstar);
                ++k;
            }
        }
        offset = xor_block(offset, ctx->Ldollar); /* Part of tag gen */
        ta[k] = xor_block(offset, checksum);      /* Part of tag gen */
        AES_ecb_encrypt_blks(ta, k + 1, &ctx->encrypt_key);
        offset = xor_block(ta[k], ctx->ad_checksum); /* Tag = E_K(...) ^ ad_checksum */
        if (remaining) {
            /* Ciphertext tail = partial plaintext XOR Pad (Pad is now in ta[k-1]) */
            --k;
            tmp.bl = xor_block(tmp.bl, ta[k]);
            memcpy(ctp + k, tmp.u8, remaining);
        }
        switch (k) { /* Deliberate fallthrough: unmask all k full blocks */
#if (BPI == 8)
        case 7:
            ctp[6] = xor_block(ta[6], oa[6]);
        /* fall through */
        case 6:
            ctp[5] = xor_block(ta[5], oa[5]);
        /* fall through */
        case 5:
            ctp[4] = xor_block(ta[4], oa[4]);
        /* fall through */
        case 4:
            ctp[3] = xor_block(ta[3], oa[3]);
        /* fall through */
#endif
        case 3:
            ctp[2] = xor_block(ta[2], oa[2]);
        /* fall through */
        case 2:
            ctp[1] = xor_block(ta[1], oa[1]);
        /* fall through */
        case 1:
            ctp[0] = xor_block(ta[0], oa[0]);
        }

        /* Tag goes to the caller's buffer when provided, otherwise it is
           appended to the ciphertext (and counted in the return value). */
        if (tag) {
#if (OCB_TAG_LEN == 16)
            *(block*)tag = offset;
#elif(OCB_TAG_LEN > 0)
            memcpy((char*)tag, &offset, OCB_TAG_LEN);
#else
            memcpy((char*)tag, &offset, ctx->tag_len);
#endif
        } else {
#if (OCB_TAG_LEN > 0)
            memcpy((char*)ct + pt_len, &offset, OCB_TAG_LEN);
            pt_len += OCB_TAG_LEN;
#else
            memcpy((char*)ct + pt_len, &offset, ctx->tag_len);
            pt_len += ctx->tag_len;
#endif
        }
    }
    return (int)pt_len;
}
   1104 
   1105 /* ----------------------------------------------------------------------- */
   1106 
   1107 /* Compare two regions of memory, taking a constant amount of time for a
   1108    given buffer size -- under certain assumptions about the compiler
   1109    and machine, of course.
   1110 
   1111    Use this to avoid timing side-channel attacks.
   1112 
   1113    Returns 0 for memory regions with equal contents; non-zero otherwise. */
   1114 static int constant_time_memcmp(const void* av, const void* bv, size_t n) {
   1115     const uint8_t* a = (const uint8_t*)av;
   1116     const uint8_t* b = (const uint8_t*)bv;
   1117     uint8_t result = 0;
   1118     size_t i;
   1119 
   1120     for (i = 0; i < n; i++) {
   1121         result |= *a ^ *b;
   1122         a++;
   1123         b++;
   1124     }
   1125 
   1126     return (int)result;
   1127 }
   1128 
/* OCB decryption and tag verification, the mirror of ae_encrypt.  When
   'final' is set and tag is NULL, the tag is taken from the end of ct
   (ct_len includes it).  Returns the plaintext byte count, or AE_INVALID
   when the recomputed tag does not match.  NOTE(review): plaintext is
   written to pt before the tag is verified -- callers must discard pt
   when AE_INVALID is returned. */
int ae_decrypt(ae_ctx* ctx, const void* nonce, const void* ct, int ct_len, const void* ad,
               int ad_len, void* pt, const void* tag, int final) {
    union {
        uint32_t u32[4];
        uint8_t u8[16];
        block bl;
    } tmp;
    block offset, checksum;
    unsigned i, k;
    block* ctp = (block*)ct;
    block* ptp = (block*)pt;

    /* Reduce ct_len tag bundled in ct */
    if ((final) && (!tag))
#if (OCB_TAG_LEN > 0)
        ct_len -= OCB_TAG_LEN;
#else
        ct_len -= ctx->tag_len;
#endif

    /* Non-null nonce means start of new message, init per-message values */
    if (nonce) {
        ctx->offset = gen_offset_from_nonce(ctx, nonce);
        ctx->ad_offset = ctx->checksum = zero_block();
        ctx->ad_blocks_processed = ctx->blocks_processed = 0;
        /* Negative ad_len appears to mean "reuse prior AD checksum"; only a
           non-negative ad_len resets it (same convention as ae_encrypt). */
        if (ad_len >= 0)
            ctx->ad_checksum = zero_block();
    }

    /* Process associated data */
    if (ad_len > 0)
        process_ad(ctx, ad, ad_len, final);

    /* Decrypt ciphertext data BPI blocks at a time */
    offset = ctx->offset;
    checksum = ctx->checksum;
    i = ct_len / (BPI * 16);
    if (i) {
        block oa[BPI];
        unsigned block_num = ctx->blocks_processed;
        oa[BPI - 1] = offset; /* Seed: running offset lives in oa[BPI-1] across iterations */
        do {
            block ta[BPI];
            block_num += BPI;
            oa[0] = xor_block(oa[BPI - 1], ctx->L[0]);
            ta[0] = xor_block(oa[0], ctp[0]);
            oa[1] = xor_block(oa[0], ctx->L[1]);
            ta[1] = xor_block(oa[1], ctp[1]);
            oa[2] = xor_block(oa[1], ctx->L[0]);
            ta[2] = xor_block(oa[2], ctp[2]);
#if BPI == 4
            oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
            ta[3] = xor_block(oa[3], ctp[3]);
#elif BPI == 8
            oa[3] = xor_block(oa[2], ctx->L[2]);
            ta[3] = xor_block(oa[3], ctp[3]);
            oa[4] = xor_block(oa[1], ctx->L[2]);
            ta[4] = xor_block(oa[4], ctp[4]);
            oa[5] = xor_block(oa[0], ctx->L[2]);
            ta[5] = xor_block(oa[5], ctp[5]);
            oa[6] = xor_block(oa[7], ctx->L[2]); /* oa[7] still holds the previous round's offset */
            ta[6] = xor_block(oa[6], ctp[6]);
            oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
            ta[7] = xor_block(oa[7], ctp[7]);
#endif
            AES_ecb_decrypt_blks(ta, BPI, &ctx->decrypt_key);
            /* P_j = D_K(C_j ^ offset_j) ^ offset_j; checksum covers plaintext */
            ptp[0] = xor_block(ta[0], oa[0]);
            checksum = xor_block(checksum, ptp[0]);
            ptp[1] = xor_block(ta[1], oa[1]);
            checksum = xor_block(checksum, ptp[1]);
            ptp[2] = xor_block(ta[2], oa[2]);
            checksum = xor_block(checksum, ptp[2]);
            ptp[3] = xor_block(ta[3], oa[3]);
            checksum = xor_block(checksum, ptp[3]);
#if (BPI == 8)
            ptp[4] = xor_block(ta[4], oa[4]);
            checksum = xor_block(checksum, ptp[4]);
            ptp[5] = xor_block(ta[5], oa[5]);
            checksum = xor_block(checksum, ptp[5]);
            ptp[6] = xor_block(ta[6], oa[6]);
            checksum = xor_block(checksum, ptp[6]);
            ptp[7] = xor_block(ta[7], oa[7]);
            checksum = xor_block(checksum, ptp[7]);
#endif
            ptp += BPI;
            ctp += BPI;
        } while (--i);
        ctx->offset = offset = oa[BPI - 1];
        ctx->blocks_processed = block_num;
        ctx->checksum = checksum;
    }

    if (final) {
        block ta[BPI + 1], oa[BPI];

        /* Process remaining plaintext and compute its tag contribution    */
        unsigned remaining = ((unsigned)ct_len) % (BPI * 16);
        k = 0; /* How many blocks in ta[] need ECBing */
        if (remaining) {
#if (BPI == 8)
            if (remaining >= 64) {
                oa[0] = xor_block(offset, ctx->L[0]);
                ta[0] = xor_block(oa[0], ctp[0]);
                oa[1] = xor_block(oa[0], ctx->L[1]);
                ta[1] = xor_block(oa[1], ctp[1]);
                oa[2] = xor_block(oa[1], ctx->L[0]);
                ta[2] = xor_block(oa[2], ctp[2]);
                offset = oa[3] = xor_block(oa[2], ctx->L[2]);
                ta[3] = xor_block(offset, ctp[3]);
                remaining -= 64;
                k = 4;
            }
#endif
            if (remaining >= 32) {
                oa[k] = xor_block(offset, ctx->L[0]);
                ta[k] = xor_block(oa[k], ctp[k]);
                offset = oa[k + 1] = xor_block(oa[k], ctx->L[1]);
                ta[k + 1] = xor_block(offset, ctp[k + 1]);
                remaining -= 32;
                k += 2;
            }
            if (remaining >= 16) {
                offset = oa[k] = xor_block(offset, ctx->L[0]);
                ta[k] = xor_block(offset, ctp[k]);
                remaining -= 16;
                ++k;
            }
            if (remaining) {
                /* Partial final block: Pad = E_K(offset ^ L_*); plaintext tail
                   is C ^ Pad, and the checksum gets P || 0x80 || 0...  (tmp
                   holds Pad bytes beyond 'remaining', so the XOR with pad
                   cancels them in tmp.bl). */
                block pad;
                offset = xor_block(offset, ctx->Lstar);
                AES_encrypt((unsigned char*)&offset, tmp.u8, &ctx->encrypt_key);
                pad = tmp.bl;
                memcpy(tmp.u8, ctp + k, remaining);
                tmp.bl = xor_block(tmp.bl, pad);
                tmp.u8[remaining] = (unsigned char)0x80u;
                memcpy(ptp + k, tmp.u8, remaining);
                checksum = xor_block(checksum, tmp.bl);
            }
        }
        AES_ecb_decrypt_blks(ta, k, &ctx->decrypt_key);
        switch (k) { /* Deliberate fallthrough: unmask all k full blocks */
#if (BPI == 8)
        case 7:
            ptp[6] = xor_block(ta[6], oa[6]);
            checksum = xor_block(checksum, ptp[6]);
        /* fall through */
        case 6:
            ptp[5] = xor_block(ta[5], oa[5]);
            checksum = xor_block(checksum, ptp[5]);
        /* fall through */
        case 5:
            ptp[4] = xor_block(ta[4], oa[4]);
            checksum = xor_block(checksum, ptp[4]);
        /* fall through */
        case 4:
            ptp[3] = xor_block(ta[3], oa[3]);
            checksum = xor_block(checksum, ptp[3]);
        /* fall through */
#endif
        case 3:
            ptp[2] = xor_block(ta[2], oa[2]);
            checksum = xor_block(checksum, ptp[2]);
        /* fall through */
        case 2:
            ptp[1] = xor_block(ta[1], oa[1]);
            checksum = xor_block(checksum, ptp[1]);
        /* fall through */
        case 1:
            ptp[0] = xor_block(ta[0], oa[0]);
            checksum = xor_block(checksum, ptp[0]);
        }

        /* Calculate expected tag: E_K(offset ^ L_$ ^ checksum) ^ ad_checksum */
        offset = xor_block(offset, ctx->Ldollar);
        tmp.bl = xor_block(offset, checksum);
        AES_encrypt(tmp.u8, tmp.u8, &ctx->encrypt_key);
        tmp.bl = xor_block(tmp.bl, ctx->ad_checksum); /* Full tag */

        /* Compare with proposed tag, change ct_len if invalid */
        if ((OCB_TAG_LEN == 16) && tag) {
            if (unequal_blocks(tmp.bl, *(block*)tag))
                ct_len = AE_INVALID;
        } else {
#if (OCB_TAG_LEN > 0)
            int len = OCB_TAG_LEN;
#else
            int len = ctx->tag_len;
#endif
            /* Constant-time compare so mismatch position doesn't leak timing */
            if (tag) {
                if (constant_time_memcmp(tag, tmp.u8, len) != 0)
                    ct_len = AE_INVALID;
            } else {
                if (constant_time_memcmp((char*)ct + ct_len, tmp.u8, len) != 0)
                    ct_len = AE_INVALID;
            }
        }
    }
    return ct_len;
}
   1322 
   1323 /* ----------------------------------------------------------------------- */
   1324 /* Simple test program                                                     */
   1325 /* ----------------------------------------------------------------------- */
   1326 
   1327 #if 0
   1328 
   1329 #include <stdio.h>
   1330 #include <time.h>
   1331 
   1332 #if __GNUC__
   1333 #define ALIGN(n) __attribute__((aligned(n)))
   1334 #elif _MSC_VER
   1335 #define ALIGN(n) __declspec(align(n))
   1336 #else /* Not GNU/Microsoft: delete alignment uses.     */
   1337 #define ALIGN(n)
   1338 #endif
   1339 
   1340 static void pbuf(void *p, unsigned len, const void *s)
   1341 {
   1342     unsigned i;
   1343     if (s)
   1344         printf("%s", (char *)s);
   1345     for (i = 0; i < len; i++)
   1346         printf("%02X", (unsigned)(((unsigned char *)p)[i]));
   1347     printf("\n");
   1348 }
   1349 
/* Debug helper (compiled out with the rest of this #if 0 section): print
   OCB test vectors for a len-byte input in three shapes -- (P=len,A=len),
   (P=0,A=len) and (P=len,A=0) -- using plaintext bytes 0..127 and a fixed
   counting nonce.  Tag is appended to ct (tag arg is NULL). */
static void vectors(ae_ctx *ctx, int len)
{
    ALIGN(16) char pt[128];
    ALIGN(16) char ct[144];
    ALIGN(16) char nonce[] = {0,1,2,3,4,5,6,7,8,9,10,11};
    int i;
    for (i=0; i < 128; i++) pt[i] = i;
    i = ae_encrypt(ctx,nonce,pt,len,pt,len,ct,NULL,AE_FINALIZE);
    printf("P=%d,A=%d: ",len,len); pbuf(ct, i, NULL);
    i = ae_encrypt(ctx,nonce,pt,0,pt,len,ct,NULL,AE_FINALIZE);
    printf("P=%d,A=%d: ",0,len); pbuf(ct, i, NULL);
    i = ae_encrypt(ctx,nonce,pt,len,pt,0,ct,NULL,AE_FINALIZE);
    printf("P=%d,A=%d: ",len,0); pbuf(ct, i, NULL);
}
   1364 
   1365 void validate()
   1366 {
   1367     ALIGN(16) char pt[1024];
   1368     ALIGN(16) char ct[1024];
   1369     ALIGN(16) char tag[16];
   1370     ALIGN(16) char nonce[12] = {0,};
   1371     ALIGN(16) char key[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
   1372     ae_ctx ctx;
   1373     char *val_buf, *next;
   1374     int i, len;
   1375 
   1376     val_buf = (char *)malloc(22400 + 16);
   1377     next = val_buf = (char *)(((size_t)val_buf + 16) & ~((size_t)15));
   1378 
   1379     if (0) {
   1380 		ae_init(&ctx, key, 16, 12, 16);
   1381 		/* pbuf(&ctx, sizeof(ctx), "CTX: "); */
   1382 		vectors(&ctx,0);
   1383 		vectors(&ctx,8);
   1384 		vectors(&ctx,16);
   1385 		vectors(&ctx,24);
   1386 		vectors(&ctx,32);
   1387 		vectors(&ctx,40);
   1388     }
   1389 
   1390     memset(key,0,32);
   1391     memset(pt,0,128);
   1392     ae_init(&ctx, key, OCB_KEY_LEN, 12, OCB_TAG_LEN);
   1393 
   1394     /* RFC Vector test */
   1395     for (i = 0; i < 128; i++) {
   1396         int first = ((i/3)/(BPI*16))*(BPI*16);
   1397         int second = first;
   1398         int third = i - (first + second);
   1399 
   1400         nonce[11] = i;
   1401 
   1402         if (0) {
   1403             ae_encrypt(&ctx,nonce,pt,i,pt,i,ct,NULL,AE_FINALIZE);
   1404             memcpy(next,ct,(size_t)i+OCB_TAG_LEN);
   1405             next = next+i+OCB_TAG_LEN;
   1406 
   1407             ae_encrypt(&ctx,nonce,pt,i,pt,0,ct,NULL,AE_FINALIZE);
   1408             memcpy(next,ct,(size_t)i+OCB_TAG_LEN);
   1409             next = next+i+OCB_TAG_LEN;
   1410 
   1411             ae_encrypt(&ctx,nonce,pt,0,pt,i,ct,NULL,AE_FINALIZE);
   1412             memcpy(next,ct,OCB_TAG_LEN);
   1413             next = next+OCB_TAG_LEN;
   1414         } else {
   1415             ae_encrypt(&ctx,nonce,pt,first,pt,first,ct,NULL,AE_PENDING);
   1416             ae_encrypt(&ctx,NULL,pt+first,second,pt+first,second,ct+first,NULL,AE_PENDING);
   1417             ae_encrypt(&ctx,NULL,pt+first+second,third,pt+first+second,third,ct+first+second,NULL,AE_FINALIZE);
   1418             memcpy(next,ct,(size_t)i+OCB_TAG_LEN);
   1419             next = next+i+OCB_TAG_LEN;
   1420 
   1421             ae_encrypt(&ctx,nonce,pt,first,pt,0,ct,NULL,AE_PENDING);
   1422             ae_encrypt(&ctx,NULL,pt+first,second,pt,0,ct+first,NULL,AE_PENDING);
   1423             ae_encrypt(&ctx,NULL,pt+first+second,third,pt,0,ct+first+second,NULL,AE_FINALIZE);
   1424             memcpy(next,ct,(size_t)i+OCB_TAG_LEN);
   1425             next = next+i+OCB_TAG_LEN;
   1426 
   1427             ae_encrypt(&ctx,nonce,pt,0,pt,first,ct,NULL,AE_PENDING);
   1428             ae_encrypt(&ctx,NULL,pt,0,pt+first,second,ct,NULL,AE_PENDING);
   1429             ae_encrypt(&ctx,NULL,pt,0,pt+first+second,third,ct,NULL,AE_FINALIZE);
   1430             memcpy(next,ct,OCB_TAG_LEN);
   1431             next = next+OCB_TAG_LEN;
   1432         }
   1433 
   1434     }
   1435     nonce[11] = 0;
   1436     ae_encrypt(&ctx,nonce,NULL,0,val_buf,next-val_buf,ct,tag,AE_FINALIZE);
   1437     pbuf(tag,OCB_TAG_LEN,0);
   1438 
   1439 
   1440     /* Encrypt/Decrypt test */
   1441     for (i = 0; i < 128; i++) {
   1442         int first = ((i/3)/(BPI*16))*(BPI*16);
   1443         int second = first;
   1444         int third = i - (first + second);
   1445 
   1446         nonce[11] = i%128;
   1447 
   1448         if (1) {
   1449             len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,i,ct,tag,AE_FINALIZE);
   1450             len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,-1,ct,tag,AE_FINALIZE);
   1451             len = ae_decrypt(&ctx,nonce,ct,len,val_buf,-1,pt,tag,AE_FINALIZE);
   1452             if (len == -1) { printf("Authentication error: %d\n", i); return; }
   1453             if (len != i) { printf("Length error: %d\n", i); return; }
   1454             if (memcmp(val_buf,pt,i)) { printf("Decrypt error: %d\n", i); return; }
   1455         } else {
   1456             len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,i,ct,NULL,AE_FINALIZE);
   1457             ae_decrypt(&ctx,nonce,ct,first,val_buf,first,pt,NULL,AE_PENDING);
   1458             ae_decrypt(&ctx,NULL,ct+first,second,val_buf+first,second,pt+first,NULL,AE_PENDING);
   1459             len = ae_decrypt(&ctx,NULL,ct+first+second,len-(first+second),val_buf+first+second,third,pt+first+second,NULL,AE_FINALIZE);
   1460             if (len == -1) { printf("Authentication error: %d\n", i); return; }
   1461             if (memcmp(val_buf,pt,i)) { printf("Decrypt error: %d\n", i); return; }
   1462         }
   1463 
   1464     }
   1465     printf("Decrypt: PASS\n");
   1466 }
   1467 
   1468 int main()
   1469 {
   1470     validate();
   1471     return 0;
   1472 }
   1473 #endif
   1474 
/* Human-readable name of the AES backend this translation unit was built
 * with, selected by the same compile-time flags that choose the AES
 * implementation. Exactly one branch defines infoString per build. */
#if USE_AES_NI
char infoString[] = "OCB3 (AES-NI)";
#elif USE_REFERENCE_AES
char infoString[] = "OCB3 (Reference)";
#elif USE_OPENSSL_AES
char infoString[] = "OCB3 (OpenSSL)";
#endif
   1482