/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size
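//
// A rough sketch of the strategy, matching the code below: the accumulator is
// kept as two interleaved Poly1305 states of five 26-bit limbs each, with
// limb i of both states living in the two 64-bit lanes of the SSE2 register
// H[i]. Each pass of poly1305_blocks absorbs 64 bytes (four 16-byte blocks)
// as H = H*[r^4,r^4] + [m0,m1]*[r^2,r^2] + [m2,m3]; poly1305_combine later
// weights the lanes by [r^2,r] and adds them together, and any remaining tail
// is finished by a scalar path working in three 44-bit limbs with 128-bit
// intermediate products.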

#include <openssl/poly1305.h>

#include "../internal.h"


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

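// These macros perform unaligned little-endian loads and stores through plain
// pointer casts. This file is only built for x86-64 (see the guard above),
// where the original code relies on such accesses being acceptable.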
#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

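// Constants, broadcast to both 64-bit lanes: the mask that keeps the low 26
// bits of a lane, the factor 5 used when folding carries out of the top limb
// (2^130 = 5 mod 2^130 - 5), and the 2^128 padding bit of a 16-byte block
// expressed as bit 24 of the fifth 26-bit limb (128 = 4*26 + 24).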
static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

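// Helpers for 128-bit arithmetic. uint128_t comes from ../internal.h (a
// 128-bit integer type on 64-bit targets) and lets the scalar path below take
// full 64x64->128-bit products.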
static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  //  80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;        //   8 bytes
  uint64_t leftover;       //   8 bytes
  uint8_t buffer[64];      //  64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

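// poly1305_state (from <openssl/poly1305.h>) is an opaque byte buffer that is
// larger than poly1305_state_internal needs; round its address up to the next
// 64-byte boundary so the SSE2 state is suitably aligned.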
static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

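// One-shot usage of the public API implemented below, for reference:
//
//   uint8_t mac[16];
//   poly1305_state st;
//   CRYPTO_poly1305_init(&st, key);   // key is 32 bytes: r || pad
//   CRYPTO_poly1305_update(&st, in, in_len);
//   CRYPTO_poly1305_finish(&st, mac);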
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
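  // r = key[0..15] with the standard Poly1305 clamp applied (the top four
  // bits of bytes 3, 7, 11, 15 and the low two bits of bytes 4, 8, 12 are
  // cleared), then split into 44-, 44- and 42-bit limbs r0, r1, r2. The masks
  // below are those limb masks with the clamped bits already removed.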
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
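  // Each pass squares (r20,r21,r22) mod 2^130 - 5 in radix 2^44: s22 is
  // r22 * 5 * 4 because the overflow out of the top limb carries a factor of
  // 2^132 = 4 * 2^130 = 4 * 5 (mod p). The result is then split into five
  // 26-bit limbs, duplicated into both SSE2 lanes, and stored alongside its
  // 5*limb companions; the loop runs twice so that P[1] ends up holding r^2
  // and P[0] holds r^4.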
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
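  // Load the first two 16-byte blocks and split each into five 26-bit limbs:
  // limb 0 = bits 0-25, limb 1 = bits 26-51, limb 2 = bits 52-77, limb 3 =
  // bits 78-103, limb 4 = bits 104-127 plus the 2^128 padding bit (HIBIT).
  // T5 holds the low halves of both blocks, T6 the high halves.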
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

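  // Each iteration folds 64 bytes into the two-lane accumulator:
  // H = H*[r^4,r^4] + [m0,m1]*[r^2,r^2] + [m2,m3]. The multiplies are plain
  // schoolbook products of 26-bit limbs via _mm_mul_epu32, with the S2x
  // (5*R2x) values standing in for limbs that wrap past 2^130.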
  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
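    // Carry each 64-bit lane sum back down to 26 bits, propagating into the
    // next limb; a carry out of limb 4 is multiplied by 5 and added to limb 0
    // since 2^130 = 5 mod 2^130 - 5.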
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
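  // Collapses the two-lane state: if at least 32 buffered bytes remain, they
  // are absorbed first (multiplied by [r^2,r^2]); the lanes are then weighted
  // by [r^2,r], summed, and written to st->HH as three 44-bit limbs for the
  // scalar finish. Returns how many input bytes were consumed (0 or 32).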
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
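  // P[1] keeps r^2 broadcast in both lanes (elements 0 and 2 of each vector);
  // writing r's 26-bit limbs into element 2 below turns each vector into
  // [r^2, r], so the multiply that follows weights lane 0 by r^2 and lane 1
  // by r before the lanes are summed.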
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

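  // Repack the five 26-bit limbs into three limbs of 44, 44 and 42 bits for
  // the scalar finish in CRYPTO_poly1305_finish.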
  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // need at least 32 initial bytes to start the accelerated branch
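  // (poly1305_first_block needs a full 32 bytes to seed the two SSE2 lanes).
  // Once started, data is consumed in 64-byte chunks and any partial chunk is
  // carried over in st->buffer.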
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

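  // Fold the SSE2 lanes back into three 44-bit limbs, then absorb whatever is
  // still buffered 16 bytes at a time with a scalar multiply by r (s1 and s2
  // are 20*r1 and 20*r2, folding the high limbs as in the vector path). The
  // last partial block is padded with a 1 byte followed by zeros, the result
  // is fully reduced mod 2^130 - 5 with a constant-time conditional
  // subtraction, and the pad from the key is added to form the tag.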
  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

// final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

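  // Compute g = h + 5 - 2^130, i.e. h - p. If the subtraction did not borrow
  // (the top bit of g2 is clear), then h >= p and g is the fully reduced
  // value; select between h and g in constant time.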
  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64