Home | History | Annotate | Download | only in poly1305
      1 /* Copyright (c) 2014, Google Inc.
      2  *
      3  * Permission to use, copy, modify, and/or distribute this software for any
      4  * purpose with or without fee is hereby granted, provided that the above
      5  * copyright notice and this permission notice appear in all copies.
      6  *
      7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
     12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
     13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
     14 
     15 /* This implementation was taken from the public domain, neon2 version in
     16  * SUPERCOP by D. J. Bernstein and Peter Schwabe. */
     17 
     18 #include <openssl/poly1305.h>
     19 
     20 #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)
     21 
     22 #include <string.h>
     23 
     24 
/* fe1305x2 holds two field elements side by side. Each element is split into
 * five limbs, which the C code in this file masks to 26 bits (0x3ffffff)
 * when normalizing. The first element occupies the even indices v[0], v[2],
 * ..., v[8] and the second the odd indices v[1], v[3], ..., v[9]. */
typedef struct {
  uint32_t v[12]; /* for alignment; only using 10 */
} fe1305x2;
     28 
/* The two routines declared below are defined outside this file (presumably
 * in the accompanying NEON assembly -- confirm). These macros map the short
 * names used here onto the library-prefixed symbol names. */
#define addmulmod openssl_poly1305_neon2_addmulmod
#define blocks openssl_poly1305_neon2_blocks

/* addmulmod writes its result to |r|. Judging by the call sites in this file
 * (e.g. "precompute r^2" with a zero |c|), it appears to compute x * y + c on
 * both lanes -- NOTE(review): confirm against the implementation. */
extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y,
                      const fe1305x2 *c);

/* blocks absorbs |inlen| bytes at |in| into |h| using |precomp|. The caller
 * subtracts the return value from |inlen| to obtain the number of bytes
 * consumed, so the return value is presumably the count of trailing bytes
 * left unprocessed -- NOTE(review): confirm against the implementation. */
extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in,
                  unsigned int inlen);
     37 
     38 static void freeze(fe1305x2 *r) {
     39   int i;
     40 
     41   uint32_t x0 = r->v[0];
     42   uint32_t x1 = r->v[2];
     43   uint32_t x2 = r->v[4];
     44   uint32_t x3 = r->v[6];
     45   uint32_t x4 = r->v[8];
     46   uint32_t y0;
     47   uint32_t y1;
     48   uint32_t y2;
     49   uint32_t y3;
     50   uint32_t y4;
     51   uint32_t swap;
     52 
     53   for (i = 0; i < 3; ++i) {
     54     x1 += x0 >> 26;
     55     x0 &= 0x3ffffff;
     56     x2 += x1 >> 26;
     57     x1 &= 0x3ffffff;
     58     x3 += x2 >> 26;
     59     x2 &= 0x3ffffff;
     60     x4 += x3 >> 26;
     61     x3 &= 0x3ffffff;
     62     x0 += 5 * (x4 >> 26);
     63     x4 &= 0x3ffffff;
     64   }
     65 
     66   y0 = x0 + 5;
     67   y1 = x1 + (y0 >> 26);
     68   y0 &= 0x3ffffff;
     69   y2 = x2 + (y1 >> 26);
     70   y1 &= 0x3ffffff;
     71   y3 = x3 + (y2 >> 26);
     72   y2 &= 0x3ffffff;
     73   y4 = x4 + (y3 >> 26);
     74   y3 &= 0x3ffffff;
     75   swap = -(y4 >> 26);
     76   y4 &= 0x3ffffff;
     77 
     78   y0 ^= x0;
     79   y1 ^= x1;
     80   y2 ^= x2;
     81   y3 ^= x3;
     82   y4 ^= x4;
     83 
     84   y0 &= swap;
     85   y1 &= swap;
     86   y2 &= swap;
     87   y3 &= swap;
     88   y4 &= swap;
     89 
     90   y0 ^= x0;
     91   y1 ^= x1;
     92   y2 ^= x2;
     93   y3 ^= x3;
     94   y4 ^= x4;
     95 
     96   r->v[0] = y0;
     97   r->v[2] = y1;
     98   r->v[4] = y2;
     99   r->v[6] = y3;
    100   r->v[8] = y4;
    101 }
    102 
    103 static void fe1305x2_tobytearray(uint8_t *r, fe1305x2 *x) {
    104   uint32_t x0 = x->v[0];
    105   uint32_t x1 = x->v[2];
    106   uint32_t x2 = x->v[4];
    107   uint32_t x3 = x->v[6];
    108   uint32_t x4 = x->v[8];
    109 
    110   x1 += x0 >> 26;
    111   x0 &= 0x3ffffff;
    112   x2 += x1 >> 26;
    113   x1 &= 0x3ffffff;
    114   x3 += x2 >> 26;
    115   x2 &= 0x3ffffff;
    116   x4 += x3 >> 26;
    117   x3 &= 0x3ffffff;
    118 
    119   *(uint32_t *)r = x0 + (x1 << 26);
    120   *(uint32_t *)(r + 4) = (x1 >> 6) + (x2 << 20);
    121   *(uint32_t *)(r + 8) = (x2 >> 12) + (x3 << 14);
    122   *(uint32_t *)(r + 12) = (x3 >> 18) + (x4 << 8);
    123 }
    124 
    125 /* load32 exists to avoid breaking strict aliasing rules in
    126  * fe1305x2_frombytearray. */
    127 static uint32_t load32(uint8_t *t) {
    128   uint32_t tmp;
    129   memcpy(&tmp, t, sizeof(tmp));
    130   return tmp;
    131 }
    132 
    133 static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x,
    134                                    unsigned long long xlen) {
    135   unsigned i;
    136   uint8_t t[17];
    137 
    138   for (i = 0; (i < 16) && (i < xlen); i++) {
    139     t[i] = x[i];
    140   }
    141   xlen -= i;
    142   x += i;
    143   t[i++] = 1;
    144   for (; i < 17; i++) {
    145     t[i] = 0;
    146   }
    147 
    148   r->v[0] = 0x3ffffff & load32(t);
    149   r->v[2] = 0x3ffffff & (load32(t + 3) >> 2);
    150   r->v[4] = 0x3ffffff & (load32(t + 6) >> 4);
    151   r->v[6] = 0x3ffffff & (load32(t + 9) >> 6);
    152   r->v[8] = load32(t + 13);
    153 
    154   if (xlen) {
    155     for (i = 0; (i < 16) && (i < xlen); i++) {
    156       t[i] = x[i];
    157     }
    158     t[i++] = 1;
    159     for (; i < 17; i++) {
    160       t[i] = 0;
    161     }
    162 
    163     r->v[1] = 0x3ffffff & load32(t);
    164     r->v[3] = 0x3ffffff & (load32(t + 3) >> 2);
    165     r->v[5] = 0x3ffffff & (load32(t + 6) >> 4);
    166     r->v[7] = 0x3ffffff & (load32(t + 9) >> 6);
    167     r->v[9] = load32(t + 13);
    168   } else {
    169     r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0;
    170   }
    171 }
    172 
/* zero is an all-zero fe1305x2: it has static storage and no initializer, so
 * every limb is 0. It is 16-byte aligned and is passed as the |c| argument of
 * addmulmod at several call sites in this file. */
static const fe1305x2 zero __attribute__((aligned(16)));
    174 
/* poly1305_state_st is the layout that the functions in this file impose on
 * the caller's opaque poly1305_state. */
struct poly1305_state_st {
  /* Backing store for five consecutive fe1305x2 values (r, h, c and two
   * precomp entries) plus slack; the functions below round the start of this
   * array up to a 16-byte boundary before using it. */
  uint8_t data[sizeof(fe1305x2[5]) + 128];
  /* Input bytes that have been received but not yet absorbed. */
  uint8_t buf[32];
  /* Number of valid bytes in |buf|. */
  unsigned int buf_used;
  /* Bytes 16..31 of the key given to init; added to the result in finish. */
  uint8_t key[16];
};
    181 
    182 void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) {
    183   struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
    184   fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
    185   fe1305x2 *const h = r + 1;
    186   fe1305x2 *const c = h + 1;
    187   fe1305x2 *const precomp = c + 1;
    188   unsigned int j;
    189 
    190   r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *)key;
    191   r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *)(key + 3)) >> 2);
    192   r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *)(key + 6)) >> 4);
    193   r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *)(key + 9)) >> 6);
    194   r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *)(key + 12)) >> 8);
    195 
    196   for (j = 0; j < 10; j++) {
    197     h->v[j] = 0; /* XXX: should fast-forward a bit */
    198   }
    199 
    200   addmulmod(precomp, r, r, &zero);                 /* precompute r^2 */
    201   addmulmod(precomp + 1, precomp, precomp, &zero); /* precompute r^4 */
    202 
    203   memcpy(st->key, key + 16, 16);
    204   st->buf_used = 0;
    205 }
    206 
    207 void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in,
    208                                  size_t in_len) {
    209   struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
    210   fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
    211   fe1305x2 *const h = r + 1;
    212   fe1305x2 *const c = h + 1;
    213   fe1305x2 *const precomp = c + 1;
    214   unsigned int i;
    215 
    216   if (st->buf_used) {
    217     unsigned int todo = 32 - st->buf_used;
    218     if (todo > in_len) {
    219       todo = in_len;
    220     }
    221     for (i = 0; i < todo; i++) {
    222       st->buf[st->buf_used + i] = in[i];
    223     }
    224     st->buf_used += todo;
    225     in_len -= todo;
    226     in += todo;
    227 
    228     if (st->buf_used == sizeof(st->buf) && in_len) {
    229       addmulmod(h, h, precomp, &zero);
    230       fe1305x2_frombytearray(c, st->buf, sizeof(st->buf));
    231       for (i = 0; i < 10; i++) {
    232         h->v[i] += c->v[i];
    233       }
    234       st->buf_used = 0;
    235     }
    236   }
    237 
    238   while (in_len > 32) {
    239     unsigned int tlen = 1048576;
    240     if (in_len < tlen) {
    241       tlen = in_len;
    242     }
    243     tlen -= blocks(h, precomp, in, tlen);
    244     in_len -= tlen;
    245     in += tlen;
    246   }
    247 
    248   if (in_len) {
    249     for (i = 0; i < in_len; i++) {
    250       st->buf[i] = in[i];
    251     }
    252     st->buf_used = in_len;
    253   }
    254 }
    255 
    256 void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) {
    257   struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
    258   fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
    259   fe1305x2 *const h = r + 1;
    260   fe1305x2 *const c = h + 1;
    261   fe1305x2 *const precomp = c + 1;
    262 
    263   addmulmod(h, h, precomp, &zero);
    264 
    265   if (st->buf_used > 16) {
    266     fe1305x2_frombytearray(c, st->buf, st->buf_used);
    267     precomp->v[1] = r->v[1];
    268     precomp->v[3] = r->v[3];
    269     precomp->v[5] = r->v[5];
    270     precomp->v[7] = r->v[7];
    271     precomp->v[9] = r->v[9];
    272     addmulmod(h, h, precomp, c);
    273   } else if (st->buf_used > 0) {
    274     fe1305x2_frombytearray(c, st->buf, st->buf_used);
    275     r->v[1] = 1;
    276     r->v[3] = 0;
    277     r->v[5] = 0;
    278     r->v[7] = 0;
    279     r->v[9] = 0;
    280     addmulmod(h, h, r, c);
    281   }
    282 
    283   h->v[0] += h->v[1];
    284   h->v[2] += h->v[3];
    285   h->v[4] += h->v[5];
    286   h->v[6] += h->v[7];
    287   h->v[8] += h->v[9];
    288   freeze(h);
    289 
    290   fe1305x2_frombytearray(c, st->key, 16);
    291   c->v[8] ^= (1 << 24);
    292 
    293   h->v[0] += c->v[0];
    294   h->v[2] += c->v[2];
    295   h->v[4] += c->v[4];
    296   h->v[6] += c->v[6];
    297   h->v[8] += c->v[8];
    298   fe1305x2_tobytearray(mac, h);
    299 }
    300 
    301 #endif  /* OPENSSL_ARM && !OPENSSL_NO_ASM */
    302