/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#include <stdint.h>
#include <arm_neon.h>

#define ROUNDS 8
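/*
 * Apply the Salsa20/8 core (ROUNDS = 8) in place to the 64-byte block that
 * input points to.
 */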
static void
salsa20_8_intrinsic(void * input)
{
  int i;

  /* Select mask for vbslq_u32: all-ones lanes come from the first operand,
   * zero lanes from the second. */
  const uint32x4_t abab = {-1,0,-1,0};

  /*
   * This is modified from SUPERCOP since we only have one argument. Usually
   * you would rearrange the constant, key, and input bytes separately, but
   * here there is just one linear array to rearrange, which is a bit easier.
   */

  /*
   * Rearrange the input into diagonals, treating it as a 4x4 matrix of
   * 32-bit values.
   */
  uint32x4_t x0x5x10x15;    /* diagonals of the 4x4 matrix */
  uint32x4_t x12x1x6x11;
  uint32x4_t x8x13x2x7;
  uint32x4_t x4x9x14x3;

  uint32x4_t x0x1x10x11;    /* intermediate half-swapped vectors */
  uint32x4_t x12x13x6x7;
  uint32x4_t x8x9x2x3;
  uint32x4_t x4x5x14x15;

  uint32x4_t x0x1x2x3;      /* rows as stored in memory */
  uint32x4_t x4x5x6x7;
  uint32x4_t x8x9x10x11;
  uint32x4_t x12x13x14x15;

  /* Load the 64-byte block as four rows of four 32-bit words. */
  x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8((uint8_t *) input));
  x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(16 + (uint8_t *) input));
  x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(32 + (uint8_t *) input));
  x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(48 + (uint8_t *) input));

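  /* First shuffle step: pair the low 64 bits of one row with the high 64
   * bits of another row. */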
  x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
  x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
  x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
  x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

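  /* Second shuffle step: blend even/odd 32-bit lanes of those pairs to form
   * the four diagonals of the matrix. */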
  x0x5x10x15 = vbslq_u32(abab,x0x1x10x11,x4x5x14x15);
  x8x13x2x7 = vbslq_u32(abab,x8x9x2x3,x12x13x6x7);
  x4x9x14x3 = vbslq_u32(abab,x4x5x14x15,x8x9x2x3);
  x12x1x6x11 = vbslq_u32(abab,x12x13x6x7,x0x1x10x11);

  uint32x4_t start0 = x0x5x10x15;
  uint32x4_t start1 = x12x1x6x11;
  uint32x4_t start3 = x4x9x14x3;
  uint32x4_t start2 = x8x13x2x7;

  /* From here on this should be the same as the SUPERCOP version. */

  uint32x4_t diag0 = start0;
  uint32x4_t diag1 = start1;
  uint32x4_t diag2 = start2;
  uint32x4_t diag3 = start3;

  uint32x4_t a0;
  uint32x4_t a1;
  uint32x4_t a2;
  uint32x4_t a3;

  for (i = ROUNDS;i > 0;i -= 2) {
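    /*
     * Each iteration performs one column round followed by one row round
     * (a Salsa20 double-round).  The vshlq_n_u32/vsriq_n_u32 pairs implement
     * 32-bit rotates left by 7, 9, 13 and 18 bits.
     */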
    a0 = diag1 + diag0;
    diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
    a1 = diag0 + diag3;
    diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
    a2 = diag3 + diag2;
    diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
    a3 = diag2 + diag1;
    diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

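    /* Rotate lanes so the same sequence of lane-wise operations below
     * implements the row round. */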
    diag3 = vextq_u32(diag3,diag3,3);
    diag2 = vextq_u32(diag2,diag2,2);
    diag1 = vextq_u32(diag1,diag1,1);

    a0 = diag3 + diag0;
    diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
    a1 = diag0 + diag1;
    diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
    a2 = diag1 + diag2;
    diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
    a3 = diag2 + diag3;
    diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

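    /* Rotate lanes back to the diagonal layout for the next double-round. */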
    diag1 = vextq_u32(diag1,diag1,3);
    diag2 = vextq_u32(diag2,diag2,2);
    diag3 = vextq_u32(diag3,diag3,1);
  }

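  /* Feed-forward: add the original input state to the round output, as the
   * Salsa20 core requires. */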
  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

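  /* Inverse of the input shuffle: blend the diagonals back into half-swapped
   * vectors, then recombine 64-bit halves to restore row order. */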
  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  /* Store the rows back to the 64-byte block. */
  vst1q_u8((uint8_t *) input,vreinterpretq_u8_u32(x0x1x2x3));
  vst1q_u8(16 + (uint8_t *) input,vreinterpretq_u8_u32(x4x5x6x7));
  vst1q_u8(32 + (uint8_t *) input,vreinterpretq_u8_u32(x8x9x10x11));
  vst1q_u8(48 + (uint8_t *) input,vreinterpretq_u8_u32(x12x13x14x15));
}
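
/*
 * Illustrative usage sketch (not part of the original file): the function
 * name and buffer below are assumptions, showing how the core might be
 * applied in place to one 64-byte block, e.g. inside scrypt's BlockMix.
 */
#if 0
static void
salsa20_8_example(void)
{
  uint8_t block[64] = {0};      /* would normally hold a 64-byte working block */

  salsa20_8_intrinsic(block);   /* transforms the block in place */
}
#endif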