Home | History | Annotate | Download | only in C
      1 /* AesOpt.c -- Intel's AES
      2 2013-11-12 : Igor Pavlov : Public domain */
      3 
      4 #include "Precomp.h"
      5 
      6 #include "CpuArch.h"
      7 
      8 #ifdef MY_CPU_X86_OR_AMD64
      9 #if _MSC_VER >= 1500
     10 #define USE_INTEL_AES
     11 #endif
     12 #endif
     13 
     14 #ifdef USE_INTEL_AES
     15 
     16 #include <wmmintrin.h>
     17 
     18 void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
     19 {
     20   __m128i m = *p;
     21   for (; numBlocks != 0; numBlocks--, data++)
     22   {
     23     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
     24     const __m128i *w = p + 3;
     25     m = _mm_xor_si128(m, *data);
     26     m = _mm_xor_si128(m, p[2]);
     27     do
     28     {
     29       m = _mm_aesenc_si128(m, w[0]);
     30       m = _mm_aesenc_si128(m, w[1]);
     31       w += 2;
     32     }
     33     while (--numRounds2 != 0);
     34     m = _mm_aesenc_si128(m, w[0]);
     35     m = _mm_aesenclast_si128(m, w[1]);
     36     *data = m;
     37   }
     38   *p = m;
     39 }
     40 
     41 #define NUM_WAYS 3
     42 
     43 #define AES_OP_W(op, n) { \
     44     const __m128i t = w[n]; \
     45     m0 = op(m0, t); \
     46     m1 = op(m1, t); \
     47     m2 = op(m2, t); \
     48     }
     49 
     50 #define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
     51 #define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
     52 #define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
     53 #define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
     54 
     55 void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
     56 {
     57   __m128i iv = *p;
     58   for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
     59   {
     60     UInt32 numRounds2 = *(const UInt32 *)(p + 1);
     61     const __m128i *w = p + numRounds2 * 2;
     62     __m128i m0, m1, m2;
     63     {
     64       const __m128i t = w[2];
     65       m0 = _mm_xor_si128(t, data[0]);
     66       m1 = _mm_xor_si128(t, data[1]);
     67       m2 = _mm_xor_si128(t, data[2]);
     68     }
     69     numRounds2--;
     70     do
     71     {
     72       AES_DEC(1)
     73       AES_DEC(0)
     74       w -= 2;
     75     }
     76     while (--numRounds2 != 0);
     77     AES_DEC(1)
     78     AES_DEC_LAST(0)
     79 
     80     {
     81       __m128i t;
     82       t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
     83       t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
     84       t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
     85     }
     86   }
     87   for (; numBlocks != 0; numBlocks--, data++)
     88   {
     89     UInt32 numRounds2 = *(const UInt32 *)(p + 1);
     90     const __m128i *w = p + numRounds2 * 2;
     91     __m128i m = _mm_xor_si128(w[2], *data);
     92     numRounds2--;
     93     do
     94     {
     95       m = _mm_aesdec_si128(m, w[1]);
     96       m = _mm_aesdec_si128(m, w[0]);
     97       w -= 2;
     98     }
     99     while (--numRounds2 != 0);
    100     m = _mm_aesdec_si128(m, w[1]);
    101     m = _mm_aesdeclast_si128(m, w[0]);
    102 
    103     m = _mm_xor_si128(m, iv);
    104     iv = *data;
    105     *data = m;
    106   }
    107   *p = iv;
    108 }
    109 
    110 void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
    111 {
    112   __m128i ctr = *p;
    113   __m128i one;
    114   one.m128i_u64[0] = 1;
    115   one.m128i_u64[1] = 0;
    116   for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
    117   {
    118     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    119     const __m128i *w = p;
    120     __m128i m0, m1, m2;
    121     {
    122       const __m128i t = w[2];
    123       ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
    124       ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
    125       ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
    126     }
    127     w += 3;
    128     do
    129     {
    130       AES_ENC(0)
    131       AES_ENC(1)
    132       w += 2;
    133     }
    134     while (--numRounds2 != 0);
    135     AES_ENC(0)
    136     AES_ENC_LAST(1)
    137     data[0] = _mm_xor_si128(data[0], m0);
    138     data[1] = _mm_xor_si128(data[1], m1);
    139     data[2] = _mm_xor_si128(data[2], m2);
    140   }
    141   for (; numBlocks != 0; numBlocks--, data++)
    142   {
    143     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    144     const __m128i *w = p;
    145     __m128i m;
    146     ctr = _mm_add_epi64(ctr, one);
    147     m = _mm_xor_si128(ctr, p[2]);
    148     w += 3;
    149     do
    150     {
    151       m = _mm_aesenc_si128(m, w[0]);
    152       m = _mm_aesenc_si128(m, w[1]);
    153       w += 2;
    154     }
    155     while (--numRounds2 != 0);
    156     m = _mm_aesenc_si128(m, w[0]);
    157     m = _mm_aesenclast_si128(m, w[1]);
    158     *data = _mm_xor_si128(*data, m);
    159   }
    160   *p = ctr;
    161 }
    162 
    163 #else
    164 
/* Prototypes of the routines the wrappers below forward to. They are not
   defined in this file; presumably they are the generic (non-AES-NI)
   implementations - confirm against the file that defines them. */
void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
    168 
/* Build without USE_INTEL_AES: keeps the AesCbc_Encode_Intel symbol
   available by forwarding all arguments unchanged to AesCbc_Encode. */
void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCbc_Encode(p, data, numBlocks);
}
    173 
/* Build without USE_INTEL_AES: keeps the AesCbc_Decode_Intel symbol
   available by forwarding all arguments unchanged to AesCbc_Decode. */
void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCbc_Decode(p, data, numBlocks);
}
    178 
/* Build without USE_INTEL_AES: keeps the AesCtr_Code_Intel symbol
   available by forwarding all arguments unchanged to AesCtr_Code. */
void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCtr_Code(p, data, numBlocks);
}
    183 
    184 #endif
    185