/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#define ROUNDS 8
static void
salsa20_8_intrinsic(void * input)
{
    int i;

    const uint32x4_t abab = {-1,0,-1,0};

    /*
     * This is modified since we only have one argument. Usually you'd rearrange
     * the constant, key, and input bytes, but we just have one linear array to
     * rearrange which is a bit easier.
     */

    /*
     * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit values.
     */
    uint32x4_t x0x5x10x15;
    uint32x4_t x12x1x6x11;
    uint32x4_t x8x13x2x7;
    uint32x4_t x4x9x14x3;

    uint32x4_t x0x1x10x11;
    uint32x4_t x12x13x6x7;
    uint32x4_t x8x9x2x3;
    uint32x4_t x4x5x14x15;

    uint32x4_t x0x1x2x3;
    uint32x4_t x4x5x6x7;
    uint32x4_t x8x9x10x11;
    uint32x4_t x12x13x14x15;

    x0x1x2x3 = vld1q_u8((uint8_t *) input);
    x4x5x6x7 = vld1q_u8(16 + (uint8_t *) input);
    x8x9x10x11 = vld1q_u8(32 + (uint8_t *) input);
    x12x13x14x15 = vld1q_u8(48 + (uint8_t *) input);

    x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
    x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
    x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
    x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

    x0x5x10x15 = vbslq_u32(abab,x0x1x10x11,x4x5x14x15);
    x8x13x2x7 = vbslq_u32(abab,x8x9x2x3,x12x13x6x7);
    x4x9x14x3 = vbslq_u32(abab,x4x5x14x15,x8x9x2x3);
    x12x1x6x11 = vbslq_u32(abab,x12x13x6x7,x0x1x10x11);

    uint32x4_t start0 = x0x5x10x15;
    uint32x4_t start1 = x12x1x6x11;
    uint32x4_t start3 = x4x9x14x3;
    uint32x4_t start2 = x8x13x2x7;

    /* From here on this should be the same as the SUPERCOP version. */

    uint32x4_t diag0 = start0;
    uint32x4_t diag1 = start1;
    uint32x4_t diag2 = start2;
    uint32x4_t diag3 = start3;

    uint32x4_t a0;
    uint32x4_t a1;
    uint32x4_t a2;
    uint32x4_t a3;

    for (i = ROUNDS;i > 0;i -= 2) {
        a0 = diag1 + diag0;
        diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
        a1 = diag0 + diag3;
        diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
        a2 = diag3 + diag2;
        diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
        a3 = diag2 + diag1;
        diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

        diag3 = vextq_u32(diag3,diag3,3);
        diag2 = vextq_u32(diag2,diag2,2);
        diag1 = vextq_u32(diag1,diag1,1);

        a0 = diag3 + diag0;
        diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
        a1 = diag0 + diag1;
        diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
        a2 = diag1 + diag2;
        diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
        a3 = diag2 + diag3;
        diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

        diag1 = vextq_u32(diag1,diag1,3);
        diag2 = vextq_u32(diag2,diag2,2);
        diag3 = vextq_u32(diag3,diag3,1);
    }

    x0x5x10x15 = diag0 + start0;
    x12x1x6x11 = diag1 + start1;
    x8x13x2x7 = diag2 + start2;
    x4x9x14x3 = diag3 + start3;

    x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
    x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
    x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
    x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

    x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
    x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
    x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
    x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

    vst1q_u8((uint8_t *) input,(uint8x16_t) x0x1x2x3);
    vst1q_u8(16 + (uint8_t *) input,(uint8x16_t) x4x5x6x7);
    vst1q_u8(32 + (uint8_t *) input,(uint8x16_t) x8x9x10x11);
    vst1q_u8(48 + (uint8_t *) input,(uint8x16_t) x12x13x14x15);
}
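
/*
 * Illustrative usage sketch (not part of the original SUPERCOP/scrypt code):
 * a minimal, hypothetical driver showing how the in-place kernel above could
 * be exercised on a 64-byte (16 x 32-bit) Salsa20 state. The guard macro
 * SALSA208_NEON_DEMO and the counting-pattern input are assumptions made for
 * this example only; the printed words are not a known test vector.
 */
#ifdef SALSA208_NEON_DEMO
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t state[16];
    int j;

    /* Fill the state with a simple counting pattern. */
    for (j = 0; j < 16; j++)
        state[j] = (uint32_t) j;

    /*
     * salsa20_8_intrinsic() reads 64 bytes from `state` and overwrites them
     * with the Salsa20/8 output (the starting words are added back in at the
     * end, as in the code above).
     */
    salsa20_8_intrinsic(state);

    for (j = 0; j < 16; j++)
        printf("x%d = 0x%08x\n", j, state[j]);
    return 0;
}
#endif /* SALSA208_NEON_DEMO */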