Home | History | Annotate | Download | only in aes
      1 /* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
      2 /**
      3  * rijndael-alg-fst.c
      4  *
      5  * @version 3.0 (December 2000)
      6  *
      7  * Optimised ANSI C code for the Rijndael cipher (now AES)
      8  *
      9  * @author Vincent Rijmen <vincent.rijmen (at) esat.kuleuven.ac.be>
     10  * @author Antoon Bosselaers <antoon.bosselaers (at) esat.kuleuven.ac.be>
     11  * @author Paulo Barreto <paulo.barreto (at) terra.com.br>
     12  *
     13  * This code is hereby placed in the public domain.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
     16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
     19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
     22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
     24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
     25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 /*
     29  * This is an experimental x86[_64] derivative. It assumes little-endian
     30  * byte order and expects the CPU to sustain unaligned memory references.
     31  * It is used as a playground for cache-timing attack mitigations and
     32  * serves as the reference C implementation for the x86[_64] assembler.
     33  *
     34  *					<appro (at) fy.chalmers.se>
     35  */
     36 
     37 
     38 #ifndef AES_DEBUG
     39 # ifndef NDEBUG
     40 #  define NDEBUG
     41 # endif
     42 #endif
     43 #include <assert.h>
     44 
     45 #include <stdlib.h>
     46 #include <openssl/aes.h>
     47 #include "aes_locl.h"
     48 
     49 /*
     50  * These two parameters control which table, the 256-byte one or the
     51  * 2KB one, is referenced in the outer and, respectively, inner rounds.
     52  */
     53 #define AES_COMPACT_IN_OUTER_ROUNDS
     54 #ifdef  AES_COMPACT_IN_OUTER_ROUNDS
     55 /* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
     56  * additionally enabling AES_COMPACT_IN_INNER_ROUNDS slows the
     57  * benchmark down *further*, by a factor of ~2, so it is disabled below. */
     58 # undef  AES_COMPACT_IN_INNER_ROUNDS
     59 #endif
     60 
     61 #if 1
     62 static void prefetch256(const void *table)
     63 {
     64 	volatile unsigned long *t=(void *)table,ret;
     65 	unsigned long sum;
     66 	int i;
     67 
     68 	/* 32 is common least cache-line size */
     69 	for (sum=0,i=0;i<256/sizeof(t[0]);i+=32/sizeof(t[0]))	sum ^= t[i];
     70 
     71 	ret = sum;
     72 }
     73 #else
     74 # define prefetch256(t)
     75 #endif
     76 
/*
 * GETU32(p): read a u32 straight through a cast pointer, i.e. in native
 * byte order and with no alignment handling.  This relies on the
 * assumption stated at the top of the file (little-endian CPU that
 * tolerates unaligned references).  Any earlier definition of the macro
 * is discarded first.
 */
     77 #undef GETU32
     78 #define GETU32(p) (*((u32*)(p)))
     79 
/*
 * 64-bit unsigned type u64 and the matching literal macro U64(C), which
 * pastes the right integer suffix onto the constant C:
 *   - _WIN32/_WIN64 without __MINGW32__: unsigned __int64,   suffix UI64
 *   - __arch64__ defined:                unsigned long,      suffix UL
 *   - anything else:                     unsigned long long, suffix ULL
 */
     80 #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
     81 typedef unsigned __int64 u64;
     82 #define U64(C)	C##UI64
     83 #elif defined(__arch64__)
     84 typedef unsigned long u64;
     85 #define U64(C)	C##UL
     86 #else
     87 typedef unsigned long long u64;
     88 #define U64(C)	C##ULL
     89 #endif
     90 
/*
 * ROTATE(a,n): rotate a left by n bits, defined only where a fast form is
 * known: _lrotl() for MSVC/ICC, and a "roll" inline-asm statement
 * expression for GCC (>= 2) on i386/x86_64, where n is passed as an
 * immediate operand.  For every other compiler/target ROTATE is left
 * undefined; users of the macro test defined(ROTATE) and fall back to
 * explicit shift pairs.
 */
     91 #undef ROTATE
     92 #if defined(_MSC_VER) || defined(__ICC)
     93 # define ROTATE(a,n)	_lrotl(a,n)
     94 #elif defined(__GNUC__) && __GNUC__>=2
     95 # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
     96 #   define ROTATE(a,n)	({ register unsigned int ret;	\
     97 				asm (			\
     98 				"roll %1,%0"		\
     99 				: "=r"(ret)		\
    100 				: "I"(n), "0"(a)	\
    101 				: "cc");		\
    102 			   ret;				\
    103 			})
    104 # endif
    105 #endif
    106 /*
    107 Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
    108 Te0[x] = S [x].[02, 01, 01, 03];
    109 Te1[x] = S [x].[03, 02, 01, 01];
    110 Te2[x] = S [x].[01, 03, 02, 01];
    111 Te3[x] = S [x].[01, 01, 03, 02];

Te0..Te3 are not separate tables.  Each macro re-bases Te at a byte offset
of 0, 3, 2 or 1 and is then indexed as an array of u64; the (u32) cast
binds to the indexed element and keeps its low 32 bits.  Because every Te
entry holds the same 4-byte pattern twice, the four offsets select the four
byte rotations listed above.

NOTE(review): with a non-zero offset, element 255 nominally covers bytes
beyond the end of Te before it is narrowed to u32 -- confirm this is benign
on the compilers this file targets.
    112 */
    113 #define Te0 (u32)((u64*)((u8*)Te+0))
    114 #define Te1 (u32)((u64*)((u8*)Te+3))
    115 #define Te2 (u32)((u64*)((u8*)Te+2))
    116 #define Te3 (u32)((u64*)((u8*)Te+1))
    117 /*
    118 Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
    119 Td0[x] = Si[x].[0e, 09, 0d, 0b];
    120 Td1[x] = Si[x].[0b, 0e, 09, 0d];
    121 Td2[x] = Si[x].[0d, 0b, 0e, 09];
    122 Td3[x] = Si[x].[09, 0d, 0b, 0e];
    123 Td4[x] = Si[x].[01];

Td0..Td3 are views into the single Td table: each macro re-bases Td at a
byte offset of 0, 3, 2 or 1, indexes it as an array of u64 and keeps the
low 32 bits of the element (the (u32) cast binds to the indexed element).
Td4 is a separate byte table.

NOTE(review): with a non-zero offset, element 255 nominally covers bytes
beyond the end of Td before it is narrowed to u32 -- confirm this is benign
on the compilers this file targets.
    124 */
    125 #define Td0 (u32)((u64*)((u8*)Td+0))
    126 #define Td1 (u32)((u64*)((u8*)Td+3))
    127 #define Td2 (u32)((u64*)((u8*)Td+2))
    128 #define Td3 (u32)((u64*)((u8*)Td+1))
    129 
/*
 * Combined encryption table, 256 entries of 8 bytes (2KB).  Every entry
 * stores the same 32-bit word in both halves, which is what lets the
 * Te0..Te3 macros read the rotated variants at byte offsets 0..3.
 * The values must not be altered.
 */
    130 static const u64 Te[256] = {
    131     U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
    132     U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
    133     U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
    134     U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
    135     U64(0x5030306050303060), U64(0x0301010203010102),
    136     U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
    137     U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
    138     U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
    139     U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
    140     U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
    141     U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
    142     U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
    143     U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
    144     U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
    145     U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
    146     U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
    147     U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
    148     U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
    149     U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
    150     U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
    151     U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
    152     U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
    153     U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
    154     U64(0x5331316253313162), U64(0x3f15152a3f15152a),
    155     U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
    156     U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
    157     U64(0x2818183028181830), U64(0xa1969637a1969637),
    158     U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
    159     U64(0x0907070e0907070e), U64(0x3612122436121224),
    160     U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
    161     U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
    162     U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
    163     U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
    164     U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
    165     U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
    166     U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
    167     U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
    168     U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
    169     U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
    170     U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
    171     U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
    172     U64(0x0000000000000000), U64(0x2cededc12cededc1),
    173     U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
    174     U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
    175     U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
    176     U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
    177     U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
    178     U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
    179     U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
    180     U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
    181     U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
    182     U64(0x5533336655333366), U64(0x9485851194858511),
    183     U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
    184     U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
    185     U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
    186     U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
    187     U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
    188     U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
    189     U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
    190     U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
    191     U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
    192     U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
    193     U64(0x3010102030101020), U64(0x1affffe51affffe5),
    194     U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
    195     U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
    196     U64(0x3513132635131326), U64(0x2fececc32fececc3),
    197     U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
    198     U64(0xcc444488cc444488), U64(0x3917172e3917172e),
    199     U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
    200     U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
    201     U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
    202     U64(0x2b1919322b191932), U64(0x957373e6957373e6),
    203     U64(0xa06060c0a06060c0), U64(0x9881811998818119),
    204     U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
    205     U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
    206     U64(0xab90903bab90903b), U64(0x8388880b8388880b),
    207     U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
    208     U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
    209     U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
    210     U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
    211     U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
    212     U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
    213     U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
    214     U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
    215     U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
    216     U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
    217     U64(0xa8919139a8919139), U64(0xa4959531a4959531),
    218     U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
    219     U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
    220     U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
    221     U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
    222     U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
    223     U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
    224     U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
    225     U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
    226     U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
    227     U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
    228     U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
    229     U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
    230     U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
    231     U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
    232     U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
    233     U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
    234     U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
    235     U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
    236     U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
    237     U64(0xd8484890d8484890), U64(0x0503030605030306),
    238     U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
    239     U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
    240     U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
    241     U64(0x9186861791868617), U64(0x58c1c19958c1c199),
    242     U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
    243     U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
    244     U64(0xb398982bb398982b), U64(0x3311112233111122),
    245     U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
    246     U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
    247     U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
    248     U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
    249     U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
    250     U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
    251     U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
    252     U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
    253     U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
    254     U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
    255     U64(0xc3414182c3414182), U64(0xb0999929b0999929),
    256     U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
    257     U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
    258     U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
    259 };
    260 
/*
 * Te4[x] = S[x]: the S-box as single bytes (256-byte table).  It is used
 * by the key expansion and by the compact outer round of AES_encrypt.
 * The values must not be altered.
 */
    261 static const u8 Te4[256] = {
    262     0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
    263     0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
    264     0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
    265     0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
    266     0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
    267     0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
    268     0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
    269     0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
    270     0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
    271     0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
    272     0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
    273     0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
    274     0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
    275     0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
    276     0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
    277     0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
    278     0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
    279     0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
    280     0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
    281     0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
    282     0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
    283     0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
    284     0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
    285     0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
    286     0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
    287     0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
    288     0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
    289     0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
    290     0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
    291     0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
    292     0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
    293     0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
    294 };
    295 
/*
 * Combined decryption table, 256 entries of 8 bytes (2KB).  Every entry
 * stores the same 32-bit word in both halves, which is what lets the
 * Td0..Td3 macros read the rotated variants at byte offsets 0..3.
 * The values must not be altered.
 */
    296 static const u64 Td[256] = {
    297     U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
    298     U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
    299     U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
    300     U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
    301     U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
    302     U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
    303     U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
    304     U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
    305     U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
    306     U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
    307     U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
    308     U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
    309     U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
    310     U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
    311     U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
    312     U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
    313     U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
    314     U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
    315     U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
    316     U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
    317     U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
    318     U64(0x6033519760335197), U64(0x457f5362457f5362),
    319     U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
    320     U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
    321     U64(0x5868487058684870), U64(0x19fd458f19fd458f),
    322     U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
    323     U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
    324     U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
    325     U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
    326     U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
    327     U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
    328     U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
    329     U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
    330     U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
    331     U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
    332     U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
    333     U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
    334     U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
    335     U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
    336     U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
    337     U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
    338     U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
    339     U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
    340     U64(0x6fd406046fd40604), U64(0xff155060ff155060),
    341     U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
    342     U64(0xcc434089cc434089), U64(0x779ed967779ed967),
    343     U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
    344     U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
    345     U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
    346     U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
    347     U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
    348     U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
    349     U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
    350     U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
    351     U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
    352     U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
    353     U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
    354     U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
    355     U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
    356     U64(0x694b775a694b775a), U64(0x161a121c161a121c),
    357     U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
    358     U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
    359     U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
    360     U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
    361     U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
    362     U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
    363     U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
    364     U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
    365     U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
    366     U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
    367     U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
    368     U64(0x4022971340229713), U64(0x2011c6842011c684),
    369     U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
    370     U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
    371     U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
    372     U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
    373     U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
    374     U64(0xfa489411fa489411), U64(0x2264e9472264e947),
    375     U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
    376     U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
    377     U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
    378     U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
    379     U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
    380     U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
    381     U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
    382     U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
    383     U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
    384     U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
    385     U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
    386     U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
    387     U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
    388     U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
    389     U64(0x097826cd097826cd), U64(0xf418596ef418596e),
    390     U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
    391     U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
    392     U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
    393     U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
    394     U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
    395     U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
    396     U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
    397     U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
    398     U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
    399     U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
    400     U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
    401     U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
    402     U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
    403     U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
    404     U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
    405     U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
    406     U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
    407     U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
    408     U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
    409     U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
    410     U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
    411     U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
    412     U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
    413     U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
    414     U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
    415     U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
    416     U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
    417     U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
    418     U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
    419     U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
    420     U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
    421     U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
    422     U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
    423     U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
    424     U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
    425 };
/*
 * Td4[x] = Si[x].[01]: single-byte companion to Td (256-byte table).
 * The values must not be altered.
 */
    426 static const u8 Td4[256] = {
    427     0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
    428     0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
    429     0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
    430     0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
    431     0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
    432     0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
    433     0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
    434     0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
    435     0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
    436     0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
    437     0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
    438     0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
    439     0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
    440     0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
    441     0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
    442     0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
    443     0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
    444     0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
    445     0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
    446     0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
    447     0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
    448     0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
    449     0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
    450     0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
    451     0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
    452     0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
    453     0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
    454     0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
    455     0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
    456     0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
    457     0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
    458     0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
    459 };
    460 
/*
 * Key-schedule round constants.  AES_set_encrypt_key XORs rcon[i] into the
 * first word produced by each expansion step; the constant sits in the low
 * byte of the word.
 */
    461 static const u32 rcon[] = {
    462     0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
    463     0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
    464     0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
    465 };
    466 
    467 /**
    468  * Expand the cipher key into the encryption key schedule.
    469  */
    470 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
    471 			AES_KEY *key) {
    472 
    473 	u32 *rk;
    474    	int i = 0;
    475 	u32 temp;
    476 
    477 	if (!userKey || !key)
    478 		return -1;
    479 	if (bits != 128 && bits != 192 && bits != 256)
    480 		return -2;
    481 
    482 	rk = key->rd_key;
    483 
    484 	if (bits==128)
    485 		key->rounds = 10;
    486 	else if (bits==192)
    487 		key->rounds = 12;
    488 	else
    489 		key->rounds = 14;
    490 
    491 	rk[0] = GETU32(userKey     );
    492 	rk[1] = GETU32(userKey +  4);
    493 	rk[2] = GETU32(userKey +  8);
    494 	rk[3] = GETU32(userKey + 12);
    495 	if (bits == 128) {
    496 		while (1) {
    497 			temp  = rk[3];
    498 			rk[4] = rk[0] ^
    499 				(Te4[(temp >>  8) & 0xff]      ) ^
    500 				(Te4[(temp >> 16) & 0xff] <<  8) ^
    501 				(Te4[(temp >> 24)       ] << 16) ^
    502 				(Te4[(temp      ) & 0xff] << 24) ^
    503 				rcon[i];
    504 			rk[5] = rk[1] ^ rk[4];
    505 			rk[6] = rk[2] ^ rk[5];
    506 			rk[7] = rk[3] ^ rk[6];
    507 			if (++i == 10) {
    508 				return 0;
    509 			}
    510 			rk += 4;
    511 		}
    512 	}
    513 	rk[4] = GETU32(userKey + 16);
    514 	rk[5] = GETU32(userKey + 20);
    515 	if (bits == 192) {
    516 		while (1) {
    517 			temp = rk[ 5];
    518 			rk[ 6] = rk[ 0] ^
    519 				(Te4[(temp >>  8) & 0xff]      ) ^
    520 				(Te4[(temp >> 16) & 0xff] <<  8) ^
    521 				(Te4[(temp >> 24)       ] << 16) ^
    522 				(Te4[(temp      ) & 0xff] << 24) ^
    523 				rcon[i];
    524 			rk[ 7] = rk[ 1] ^ rk[ 6];
    525 			rk[ 8] = rk[ 2] ^ rk[ 7];
    526 			rk[ 9] = rk[ 3] ^ rk[ 8];
    527 			if (++i == 8) {
    528 				return 0;
    529 			}
    530 			rk[10] = rk[ 4] ^ rk[ 9];
    531 			rk[11] = rk[ 5] ^ rk[10];
    532 			rk += 6;
    533 		}
    534 	}
    535 	rk[6] = GETU32(userKey + 24);
    536 	rk[7] = GETU32(userKey + 28);
    537 	if (bits == 256) {
    538 		while (1) {
    539 			temp = rk[ 7];
    540 			rk[ 8] = rk[ 0] ^
    541 				(Te4[(temp >>  8) & 0xff]      ) ^
    542 				(Te4[(temp >> 16) & 0xff] <<  8) ^
    543 				(Te4[(temp >> 24)       ] << 16) ^
    544 				(Te4[(temp      ) & 0xff] << 24) ^
    545 				rcon[i];
    546 			rk[ 9] = rk[ 1] ^ rk[ 8];
    547 			rk[10] = rk[ 2] ^ rk[ 9];
    548 			rk[11] = rk[ 3] ^ rk[10];
    549 			if (++i == 7) {
    550 				return 0;
    551 			}
    552 			temp = rk[11];
    553 			rk[12] = rk[ 4] ^
    554 				(Te4[(temp      ) & 0xff]      ) ^
    555 				(Te4[(temp >>  8) & 0xff] <<  8) ^
    556 				(Te4[(temp >> 16) & 0xff] << 16) ^
    557 				(Te4[(temp >> 24)       ] << 24);
    558 			rk[13] = rk[ 5] ^ rk[12];
    559 			rk[14] = rk[ 6] ^ rk[13];
    560 			rk[15] = rk[ 7] ^ rk[14];
    561 
    562 			rk += 8;
    563         	}
    564 	}
    565 	return 0;
    566 }
    567 
    568 /**
    569  * Expand the cipher key into the decryption key schedule.
    570  */
    571 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
    572 			 AES_KEY *key) {
    573 
    574         u32 *rk;
    575 	int i, j, status;
    576 	u32 temp;
    577 
    578 	/* first, start with an encryption schedule */
    579 	status = AES_set_encrypt_key(userKey, bits, key);
    580 	if (status < 0)
    581 		return status;
    582 
    583 	rk = key->rd_key;
    584 
    585 	/* invert the order of the round keys: */
    586 	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
    587 		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
    588 		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
    589 		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
    590 		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
    591 	}
    592 	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
    593 	for (i = 1; i < (key->rounds); i++) {
    594 		rk += 4;
    595 #if 1
    596 		for (j = 0; j < 4; j++) {
    597 			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
    598 
    599 			tp1 = rk[j];
    600 			m = tp1 & 0x80808080;
    601 			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
    602 				((m - (m >> 7)) & 0x1b1b1b1b);
    603 			m = tp2 & 0x80808080;
    604 			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
    605 				((m - (m >> 7)) & 0x1b1b1b1b);
    606 			m = tp4 & 0x80808080;
    607 			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
    608 				((m - (m >> 7)) & 0x1b1b1b1b);
    609 			tp9 = tp8 ^ tp1;
    610 			tpb = tp9 ^ tp2;
    611 			tpd = tp9 ^ tp4;
    612 			tpe = tp8 ^ tp4 ^ tp2;
    613 #if defined(ROTATE)
    614 			rk[j] = tpe ^ ROTATE(tpd,16) ^
    615 				ROTATE(tp9,8) ^ ROTATE(tpb,24);
    616 #else
    617 			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
    618 				(tp9 >> 24) ^ (tp9 << 8) ^
    619 				(tpb >> 8) ^ (tpb << 24);
    620 #endif
    621 		}
    622 #else
    623 		rk[0] =
    624 			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
    625 			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
    626 			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
    627 			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
    628 		rk[1] =
    629 			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
    630 			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
    631 			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
    632 			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
    633 		rk[2] =
    634 			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
    635 			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
    636 			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
    637 			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
    638 		rk[3] =
    639 			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
    640 			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
    641 			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
    642 			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
    643 #endif
    644 	}
    645 	return 0;
    646 }
    647 
    648 /*
    649  * Encrypt a single block
    650  * in and out can overlap
    651  */
    652 void AES_encrypt(const unsigned char *in, unsigned char *out,
    653 		 const AES_KEY *key) {
    654 
    655 	const u32 *rk;
    656 	u32 s0, s1, s2, s3, t[4];
    657 	int r;
    658 
    659 	assert(in && out && key);
    660 	rk = key->rd_key;
    661 
    662 	/*
    663 	 * map byte array block to cipher state
    664 	 * and add initial round key:
    665 	 */
    666 	s0 = GETU32(in     ) ^ rk[0];
    667 	s1 = GETU32(in +  4) ^ rk[1];
    668 	s2 = GETU32(in +  8) ^ rk[2];
    669 	s3 = GETU32(in + 12) ^ rk[3];
    670 
    671 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
    672 	prefetch256(Te4);
    673 
    674 	t[0] =	Te4[(s0      ) & 0xff]       ^
    675 		Te4[(s1 >>  8) & 0xff] <<  8 ^
    676 		Te4[(s2 >> 16) & 0xff] << 16 ^
    677 		Te4[(s3 >> 24)       ] << 24;
    678 	t[1] =	Te4[(s1      ) & 0xff]       ^
    679 		Te4[(s2 >>  8) & 0xff] <<  8 ^
    680 		Te4[(s3 >> 16) & 0xff] << 16 ^
    681 		Te4[(s0 >> 24)       ] << 24;
    682 	t[2] =	Te4[(s2      ) & 0xff]       ^
    683 		Te4[(s3 >>  8) & 0xff] <<  8 ^
    684 		Te4[(s0 >> 16) & 0xff] << 16 ^
    685 		Te4[(s1 >> 24)       ] << 24;
    686 	t[3] =	Te4[(s3      ) & 0xff]       ^
    687 		Te4[(s0 >>  8) & 0xff] <<  8 ^
    688 		Te4[(s1 >> 16) & 0xff] << 16 ^
    689 		Te4[(s2 >> 24)       ] << 24;
    690 
    691 	/* now do the linear transform using words */
    692 	{	int i;
    693 		u32 r0, r1, r2;
    694 
    695 		for (i = 0; i < 4; i++) {
    696 			r0 = t[i];
    697 			r1 = r0 & 0x80808080;
    698 			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
    699 				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
    700 #if defined(ROTATE)
    701 			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
    702 				ROTATE(r0,16) ^ ROTATE(r0,8);
    703 #else
    704 			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
    705 				(r0 << 16) ^ (r0 >> 16) ^
    706 				(r0 << 8) ^ (r0 >> 24);
    707 #endif
    708 			t[i] ^= rk[4+i];
    709 		}
    710 	}
    711 #else
    712 	t[0] =	Te0[(s0      ) & 0xff] ^
    713 		Te1[(s1 >>  8) & 0xff] ^
    714 		Te2[(s2 >> 16) & 0xff] ^
    715 		Te3[(s3 >> 24)       ] ^
    716 		rk[4];
    717 	t[1] =	Te0[(s1      ) & 0xff] ^
    718 		Te1[(s2 >>  8) & 0xff] ^
    719 		Te2[(s3 >> 16) & 0xff] ^
    720 		Te3[(s0 >> 24)       ] ^
    721 		rk[5];
    722 	t[2] =	Te0[(s2      ) & 0xff] ^
    723 		Te1[(s3 >>  8) & 0xff] ^
    724 		Te2[(s0 >> 16) & 0xff] ^
    725 		Te3[(s1 >> 24)       ] ^
    726 		rk[6];
    727 	t[3] =	Te0[(s3      ) & 0xff] ^
    728 		Te1[(s0 >>  8) & 0xff] ^
    729 		Te2[(s1 >> 16) & 0xff] ^
    730 		Te3[(s2 >> 24)       ] ^
    731 		rk[7];
    732 #endif
    733 	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
    734 
    735     /*
    736      * Nr - 2 full rounds:
    737      */
    738     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
    739 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
    740 	t[0] =	Te4[(s0      ) & 0xff]       ^
    741 		Te4[(s1 >>  8) & 0xff] <<  8 ^
    742 		Te4[(s2 >> 16) & 0xff] << 16 ^
    743 		Te4[(s3 >> 24)       ] << 24;
    744 	t[1] =	Te4[(s1      ) & 0xff]       ^
    745 		Te4[(s2 >>  8) & 0xff] <<  8 ^
    746 		Te4[(s3 >> 16) & 0xff] << 16 ^
    747 		Te4[(s0 >> 24)       ] << 24;
    748 	t[2] =	Te4[(s2      ) & 0xff]       ^
    749 		Te4[(s3 >>  8) & 0xff] <<  8 ^
    750 		Te4[(s0 >> 16) & 0xff] << 16 ^
    751 		Te4[(s1 >> 24)       ] << 24;
    752 	t[3] =	Te4[(s3      ) & 0xff]       ^
    753 		Te4[(s0 >>  8) & 0xff] <<  8 ^
    754 		Te4[(s1 >> 16) & 0xff] << 16 ^
    755 		Te4[(s2 >> 24)       ] << 24;
    756 
    757 	/* now do the linear transform using words */
    758 	{	int i;
    759 		u32 r0, r1, r2;
    760 
    761 		for (i = 0; i < 4; i++) {
    762 			r0 = t[i];
    763 			r1 = r0 & 0x80808080;
    764 			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
    765 				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
    766 #if defined(ROTATE)
    767 			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
    768 				ROTATE(r0,16) ^ ROTATE(r0,8);
    769 #else
    770 			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
    771 				(r0 << 16) ^ (r0 >> 16) ^
    772 				(r0 << 8) ^ (r0 >> 24);
    773 #endif
    774 			t[i] ^= rk[i];
    775 		}
    776 	}
    777 #else
    778 	t[0] =	Te0[(s0      ) & 0xff] ^
    779 		Te1[(s1 >>  8) & 0xff] ^
    780 		Te2[(s2 >> 16) & 0xff] ^
    781 		Te3[(s3 >> 24)       ] ^
    782 		rk[0];
    783 	t[1] =	Te0[(s1      ) & 0xff] ^
    784 		Te1[(s2 >>  8) & 0xff] ^
    785 		Te2[(s3 >> 16) & 0xff] ^
    786 		Te3[(s0 >> 24)       ] ^
    787 		rk[1];
    788 	t[2] =	Te0[(s2      ) & 0xff] ^
    789 		Te1[(s3 >>  8) & 0xff] ^
    790 		Te2[(s0 >> 16) & 0xff] ^
    791 		Te3[(s1 >> 24)       ] ^
    792 		rk[2];
    793 	t[3] =	Te0[(s3      ) & 0xff] ^
    794 		Te1[(s0 >>  8) & 0xff] ^
    795 		Te2[(s1 >> 16) & 0xff] ^
    796 		Te3[(s2 >> 24)       ] ^
    797 		rk[3];
    798 #endif
    799 	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
    800     }
    801     /*
    802 	 * apply last round and
    803 	 * map cipher state to byte array block:
    804 	 */
    805 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
    806 	prefetch256(Te4);
    807 
    808 	*(u32*)(out+0) =
    809 		Te4[(s0      ) & 0xff]       ^
    810 		Te4[(s1 >>  8) & 0xff] <<  8 ^
    811 		Te4[(s2 >> 16) & 0xff] << 16 ^
    812 		Te4[(s3 >> 24)       ] << 24 ^
    813 		rk[0];
    814 	*(u32*)(out+4) =
    815 		Te4[(s1      ) & 0xff]       ^
    816 		Te4[(s2 >>  8) & 0xff] <<  8 ^
    817 		Te4[(s3 >> 16) & 0xff] << 16 ^
    818 		Te4[(s0 >> 24)       ] << 24 ^
    819 		rk[1];
    820 	*(u32*)(out+8) =
    821 		Te4[(s2      ) & 0xff]       ^
    822 		Te4[(s3 >>  8) & 0xff] <<  8 ^
    823 		Te4[(s0 >> 16) & 0xff] << 16 ^
    824 		Te4[(s1 >> 24)       ] << 24 ^
    825 		rk[2];
    826 	*(u32*)(out+12) =
    827 		Te4[(s3      ) & 0xff]       ^
    828 		Te4[(s0 >>  8) & 0xff] <<  8 ^
    829 		Te4[(s1 >> 16) & 0xff] << 16 ^
    830 		Te4[(s2 >> 24)       ] << 24 ^
    831 		rk[3];
    832 #else
    833 	*(u32*)(out+0) =
    834 		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
    835 		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
    836 		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
    837 		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
    838 		rk[0];
    839 	*(u32*)(out+4) =
    840 		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
    841 		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
    842 		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
    843 		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
    844 		rk[1];
    845 	*(u32*)(out+8) =
    846 		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
    847 		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
    848 		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
    849 		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
    850 		rk[2];
    851 	*(u32*)(out+12) =
    852 		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
    853 		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
    854 		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
    855 		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
    856 		rk[3];
    857 #endif
    858 }
    859 
    860 /*
    861  * Decrypt a single block
    862  * in and out can overlap
    863  */
    864 void AES_decrypt(const unsigned char *in, unsigned char *out,
    865 		 const AES_KEY *key) {
    866 
    867 	const u32 *rk;
    868 	u32 s0, s1, s2, s3, t[4];
    869 	int r;
    870 
    871 	assert(in && out && key);
    872 	rk = key->rd_key;
    873 
    874 	/*
    875 	 * map byte array block to cipher state
    876 	 * and add initial round key:
    877 	 */
    878 	s0 = GETU32(in     ) ^ rk[0];
    879 	s1 = GETU32(in +  4) ^ rk[1];
    880 	s2 = GETU32(in +  8) ^ rk[2];
    881 	s3 = GETU32(in + 12) ^ rk[3];
    882 
    883 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
    884 	prefetch256(Td4);
    885 
    886         t[0] =	Td4[(s0      ) & 0xff]       ^
    887 		Td4[(s3 >>  8) & 0xff] <<  8 ^
    888 		Td4[(s2 >> 16) & 0xff] << 16 ^
    889 		Td4[(s1 >> 24)       ] << 24;
    890         t[1] =	Td4[(s1      ) & 0xff]       ^
    891 		Td4[(s0 >>  8) & 0xff] <<  8 ^
    892 		Td4[(s3 >> 16) & 0xff] << 16 ^
    893 		Td4[(s2 >> 24)       ] << 24;
    894         t[2] =	Td4[(s2      ) & 0xff]       ^
    895 		Td4[(s1 >>  8) & 0xff] <<  8 ^
    896 		Td4[(s0 >> 16) & 0xff] << 16 ^
    897 		Td4[(s3 >> 24)       ] << 24;
    898         t[3] =	Td4[(s3      ) & 0xff]       ^
    899 		Td4[(s2 >>  8) & 0xff] <<  8 ^
    900 		Td4[(s1 >> 16) & 0xff] << 16 ^
    901 		Td4[(s0 >> 24)       ] << 24;
    902 
    903 	/* now do the linear transform using words */
    904 	{	int i;
    905 		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
    906 
    907 		for (i = 0; i < 4; i++) {
    908 			tp1 = t[i];
    909 			m = tp1 & 0x80808080;
    910 			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
    911 				((m - (m >> 7)) & 0x1b1b1b1b);
    912 			m = tp2 & 0x80808080;
    913 			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
    914 				((m - (m >> 7)) & 0x1b1b1b1b);
    915 			m = tp4 & 0x80808080;
    916 			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
    917 				((m - (m >> 7)) & 0x1b1b1b1b);
    918 			tp9 = tp8 ^ tp1;
    919 			tpb = tp9 ^ tp2;
    920 			tpd = tp9 ^ tp4;
    921 			tpe = tp8 ^ tp4 ^ tp2;
    922 #if defined(ROTATE)
    923 			t[i] = tpe ^ ROTATE(tpd,16) ^
    924 				ROTATE(tp9,8) ^ ROTATE(tpb,24);
    925 #else
    926 			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
    927 				(tp9 >> 24) ^ (tp9 << 8) ^
    928 				(tpb >> 8) ^ (tpb << 24);
    929 #endif
    930 			t[i] ^= rk[4+i];
    931 		}
    932 	}
    933 #else
    934 	t[0] =	Td0[(s0      ) & 0xff] ^
    935 		Td1[(s3 >>  8) & 0xff] ^
    936 		Td2[(s2 >> 16) & 0xff] ^
    937 		Td3[(s1 >> 24)       ] ^
    938 		rk[4];
    939 	t[1] =	Td0[(s1      ) & 0xff] ^
    940 		Td1[(s0 >>  8) & 0xff] ^
    941 		Td2[(s3 >> 16) & 0xff] ^
    942 		Td3[(s2 >> 24)       ] ^
    943 		rk[5];
    944 	t[2] =	Td0[(s2      ) & 0xff] ^
    945 		Td1[(s1 >>  8) & 0xff] ^
    946 		Td2[(s0 >> 16) & 0xff] ^
    947 		Td3[(s3 >> 24)       ] ^
    948 		rk[6];
    949 	t[3] =	Td0[(s3      ) & 0xff] ^
    950 		Td1[(s2 >>  8) & 0xff] ^
    951 		Td2[(s1 >> 16) & 0xff] ^
    952 		Td3[(s0 >> 24)       ] ^
    953 		rk[7];
    954 #endif
    955 	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
    956 
    957     /*
    958      * Nr - 2 full rounds:
    959      */
    960     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
    961 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
    962         t[0] =	Td4[(s0      ) & 0xff]       ^
    963 		Td4[(s3 >>  8) & 0xff] <<  8 ^
    964 		Td4[(s2 >> 16) & 0xff] << 16 ^
    965 		Td4[(s1 >> 24)       ] << 24;
    966         t[1] =	Td4[(s1      ) & 0xff]       ^
    967 		Td4[(s0 >>  8) & 0xff] <<  8 ^
    968 		Td4[(s3 >> 16) & 0xff] << 16 ^
    969 		Td4[(s2 >> 24)       ] << 24;
    970         t[2] =	Td4[(s2      ) & 0xff]       ^
    971 		Td4[(s1 >>  8) & 0xff] <<  8 ^
    972 		Td4[(s0 >> 16) & 0xff] << 16 ^
    973 		Td4[(s3 >> 24)       ] << 24;
    974         t[3] =	Td4[(s3      ) & 0xff]       ^
    975 		Td4[(s2 >>  8) & 0xff] <<  8 ^
    976 		Td4[(s1 >> 16) & 0xff] << 16 ^
    977 		Td4[(s0 >> 24)       ] << 24;
    978 
    979 	/* now do the linear transform using words */
    980 	{	int i;
    981 		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
    982 
    983 		for (i = 0; i < 4; i++) {
    984 			tp1 = t[i];
    985 			m = tp1 & 0x80808080;
    986 			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
    987 				((m - (m >> 7)) & 0x1b1b1b1b);
    988 			m = tp2 & 0x80808080;
    989 			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
    990 				((m - (m >> 7)) & 0x1b1b1b1b);
    991 			m = tp4 & 0x80808080;
    992 			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
    993 				((m - (m >> 7)) & 0x1b1b1b1b);
    994 			tp9 = tp8 ^ tp1;
    995 			tpb = tp9 ^ tp2;
    996 			tpd = tp9 ^ tp4;
    997 			tpe = tp8 ^ tp4 ^ tp2;
    998 #if defined(ROTATE)
    999 			t[i] = tpe ^ ROTATE(tpd,16) ^
   1000 				ROTATE(tp9,8) ^ ROTATE(tpb,24);
   1001 #else
   1002 			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
   1003 				(tp9 >> 24) ^ (tp9 << 8) ^
   1004 				(tpb >> 8) ^ (tpb << 24);
   1005 #endif
   1006 			t[i] ^= rk[i];
   1007 		}
   1008 	}
   1009 #else
   1010 	t[0] =	Td0[(s0      ) & 0xff] ^
   1011 		Td1[(s3 >>  8) & 0xff] ^
   1012 		Td2[(s2 >> 16) & 0xff] ^
   1013 		Td3[(s1 >> 24)       ] ^
   1014 		rk[0];
   1015 	t[1] =	Td0[(s1      ) & 0xff] ^
   1016 		Td1[(s0 >>  8) & 0xff] ^
   1017 		Td2[(s3 >> 16) & 0xff] ^
   1018 		Td3[(s2 >> 24)       ] ^
   1019 		rk[1];
   1020 	t[2] =	Td0[(s2      ) & 0xff] ^
   1021 		Td1[(s1 >>  8) & 0xff] ^
   1022 		Td2[(s0 >> 16) & 0xff] ^
   1023 		Td3[(s3 >> 24)       ] ^
   1024 		rk[2];
   1025 	t[3] =	Td0[(s3      ) & 0xff] ^
   1026 		Td1[(s2 >>  8) & 0xff] ^
   1027 		Td2[(s1 >> 16) & 0xff] ^
   1028 		Td3[(s0 >> 24)       ] ^
   1029 		rk[3];
   1030 #endif
   1031 	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
   1032     }
   1033     /*
   1034 	 * apply last round and
   1035 	 * map cipher state to byte array block:
   1036 	 */
   1037 	prefetch256(Td4);
   1038 
   1039 	*(u32*)(out+0) =
   1040 		(Td4[(s0      ) & 0xff])	^
   1041 		(Td4[(s3 >>  8) & 0xff] <<  8) ^
   1042 		(Td4[(s2 >> 16) & 0xff] << 16) ^
   1043 		(Td4[(s1 >> 24)       ] << 24) ^
   1044 		rk[0];
   1045 	*(u32*)(out+4) =
   1046 		(Td4[(s1      ) & 0xff])	 ^
   1047 		(Td4[(s0 >>  8) & 0xff] <<  8) ^
   1048 		(Td4[(s3 >> 16) & 0xff] << 16) ^
   1049 		(Td4[(s2 >> 24)       ] << 24) ^
   1050 		rk[1];
   1051 	*(u32*)(out+8) =
   1052 		(Td4[(s2      ) & 0xff])	 ^
   1053 		(Td4[(s1 >>  8) & 0xff] <<  8) ^
   1054 		(Td4[(s0 >> 16) & 0xff] << 16) ^
   1055 		(Td4[(s3 >> 24)       ] << 24) ^
   1056 		rk[2];
   1057 	*(u32*)(out+12) =
   1058 		(Td4[(s3      ) & 0xff])	 ^
   1059 		(Td4[(s2 >>  8) & 0xff] <<  8) ^
   1060 		(Td4[(s1 >> 16) & 0xff] << 16) ^
   1061 		(Td4[(s0 >> 24)       ] << 24) ^
   1062 		rk[3];
   1063 }
   1064