Home | History | Annotate | Download | only in engines
      1 package org.bouncycastle.crypto.engines;
      2 
      3 import org.bouncycastle.crypto.BlockCipher;
      4 import org.bouncycastle.crypto.CipherParameters;
      5 import org.bouncycastle.crypto.DataLengthException;
      6 import org.bouncycastle.crypto.params.KeyParameter;
      7 
      8 /**
      9  * an implementation of the AES (Rijndael), from FIPS-197.
     10  * <p>
     11  * For further details see: <a href="http://csrc.nist.gov/encryption/aes/">http://csrc.nist.gov/encryption/aes/</a>.
     12  *
     13  * This implementation is based on optimizations from Dr. Brian Gladman's paper and C code at
     14  * <a href="http://fp.gladman.plus.com/cryptography_technology/rijndael/">http://fp.gladman.plus.com/cryptography_technology/rijndael/</a>
     15  *
     16  * There are three levels of tradeoff of speed vs memory
     17  * Because java has no preprocessor, they are written as three separate classes from which to choose
     18  *
     19  * The fastest uses 8Kbytes of static tables to precompute round calculations, 4 256 word tables for encryption
     20  * and 4 for decryption.
     21  *
     22  * The middle performance version uses only one 256 word table for each, for a total of 2Kbytes,
     23  * adding 12 rotate operations per round to compute the values contained in the other tables from
     24  * the contents of the first
     25  *
     26  * The slowest version uses no static tables at all and computes the values
     27  * in each round.
     28  * <p>
     29  * This file contains the slowest performance version with no static tables
     30  * for round precomputation, but it has the smallest foot print.
     31  *
     32  */
     33 public class AESLightEngine
     34     implements BlockCipher
     35 {
     36     // The S box
     37     private static final byte[] S = {
     38         (byte)99, (byte)124, (byte)119, (byte)123, (byte)242, (byte)107, (byte)111, (byte)197,
     39         (byte)48,   (byte)1, (byte)103,  (byte)43, (byte)254, (byte)215, (byte)171, (byte)118,
     40         (byte)202, (byte)130, (byte)201, (byte)125, (byte)250,  (byte)89,  (byte)71, (byte)240,
     41         (byte)173, (byte)212, (byte)162, (byte)175, (byte)156, (byte)164, (byte)114, (byte)192,
     42         (byte)183, (byte)253, (byte)147,  (byte)38,  (byte)54,  (byte)63, (byte)247, (byte)204,
     43         (byte)52, (byte)165, (byte)229, (byte)241, (byte)113, (byte)216,  (byte)49,  (byte)21,
     44         (byte)4, (byte)199,  (byte)35, (byte)195,  (byte)24, (byte)150,   (byte)5, (byte)154,
     45         (byte)7,  (byte)18, (byte)128, (byte)226, (byte)235,  (byte)39, (byte)178, (byte)117,
     46         (byte)9, (byte)131,  (byte)44,  (byte)26,  (byte)27, (byte)110,  (byte)90, (byte)160,
     47         (byte)82,  (byte)59, (byte)214, (byte)179,  (byte)41, (byte)227,  (byte)47, (byte)132,
     48         (byte)83, (byte)209,   (byte)0, (byte)237,  (byte)32, (byte)252, (byte)177,  (byte)91,
     49         (byte)106, (byte)203, (byte)190,  (byte)57,  (byte)74,  (byte)76,  (byte)88, (byte)207,
     50         (byte)208, (byte)239, (byte)170, (byte)251,  (byte)67,  (byte)77,  (byte)51, (byte)133,
     51         (byte)69, (byte)249,   (byte)2, (byte)127,  (byte)80,  (byte)60, (byte)159, (byte)168,
     52         (byte)81, (byte)163,  (byte)64, (byte)143, (byte)146, (byte)157,  (byte)56, (byte)245,
     53         (byte)188, (byte)182, (byte)218,  (byte)33,  (byte)16, (byte)255, (byte)243, (byte)210,
     54         (byte)205,  (byte)12,  (byte)19, (byte)236,  (byte)95, (byte)151,  (byte)68,  (byte)23,
     55         (byte)196, (byte)167, (byte)126,  (byte)61, (byte)100,  (byte)93,  (byte)25, (byte)115,
     56         (byte)96, (byte)129,  (byte)79, (byte)220,  (byte)34,  (byte)42, (byte)144, (byte)136,
     57         (byte)70, (byte)238, (byte)184,  (byte)20, (byte)222,  (byte)94,  (byte)11, (byte)219,
     58         (byte)224,  (byte)50,  (byte)58,  (byte)10,  (byte)73,   (byte)6,  (byte)36,  (byte)92,
     59         (byte)194, (byte)211, (byte)172,  (byte)98, (byte)145, (byte)149, (byte)228, (byte)121,
     60         (byte)231, (byte)200,  (byte)55, (byte)109, (byte)141, (byte)213,  (byte)78, (byte)169,
     61         (byte)108,  (byte)86, (byte)244, (byte)234, (byte)101, (byte)122, (byte)174,   (byte)8,
     62         (byte)186, (byte)120,  (byte)37,  (byte)46,  (byte)28, (byte)166, (byte)180, (byte)198,
     63         (byte)232, (byte)221, (byte)116,  (byte)31,  (byte)75, (byte)189, (byte)139, (byte)138,
     64         (byte)112,  (byte)62, (byte)181, (byte)102,  (byte)72,   (byte)3, (byte)246,  (byte)14,
     65         (byte)97,  (byte)53,  (byte)87, (byte)185, (byte)134, (byte)193,  (byte)29, (byte)158,
     66         (byte)225, (byte)248, (byte)152,  (byte)17, (byte)105, (byte)217, (byte)142, (byte)148,
     67         (byte)155,  (byte)30, (byte)135, (byte)233, (byte)206,  (byte)85,  (byte)40, (byte)223,
     68         (byte)140, (byte)161, (byte)137,  (byte)13, (byte)191, (byte)230,  (byte)66, (byte)104,
     69         (byte)65, (byte)153,  (byte)45,  (byte)15, (byte)176,  (byte)84, (byte)187,  (byte)22,
     70     };
     71 
     72     // The inverse S-box
     73     private static final byte[] Si = {
     74         (byte)82,   (byte)9, (byte)106, (byte)213,  (byte)48,  (byte)54, (byte)165,  (byte)56,
     75         (byte)191,  (byte)64, (byte)163, (byte)158, (byte)129, (byte)243, (byte)215, (byte)251,
     76         (byte)124, (byte)227,  (byte)57, (byte)130, (byte)155,  (byte)47, (byte)255, (byte)135,
     77         (byte)52, (byte)142,  (byte)67,  (byte)68, (byte)196, (byte)222, (byte)233, (byte)203,
     78         (byte)84, (byte)123, (byte)148,  (byte)50, (byte)166, (byte)194,  (byte)35,  (byte)61,
     79         (byte)238,  (byte)76, (byte)149,  (byte)11,  (byte)66, (byte)250, (byte)195,  (byte)78,
     80         (byte)8,  (byte)46, (byte)161, (byte)102,  (byte)40, (byte)217,  (byte)36, (byte)178,
     81         (byte)118,  (byte)91, (byte)162,  (byte)73, (byte)109, (byte)139, (byte)209,  (byte)37,
     82         (byte)114, (byte)248, (byte)246, (byte)100, (byte)134, (byte)104, (byte)152,  (byte)22,
     83         (byte)212, (byte)164,  (byte)92, (byte)204,  (byte)93, (byte)101, (byte)182, (byte)146,
     84         (byte)108, (byte)112,  (byte)72,  (byte)80, (byte)253, (byte)237, (byte)185, (byte)218,
     85         (byte)94,  (byte)21,  (byte)70,  (byte)87, (byte)167, (byte)141, (byte)157, (byte)132,
     86         (byte)144, (byte)216, (byte)171,   (byte)0, (byte)140, (byte)188, (byte)211,  (byte)10,
     87         (byte)247, (byte)228,  (byte)88,   (byte)5, (byte)184, (byte)179,  (byte)69,   (byte)6,
     88         (byte)208,  (byte)44,  (byte)30, (byte)143, (byte)202,  (byte)63,  (byte)15,   (byte)2,
     89         (byte)193, (byte)175, (byte)189,   (byte)3,   (byte)1,  (byte)19, (byte)138, (byte)107,
     90         (byte)58, (byte)145,  (byte)17,  (byte)65,  (byte)79, (byte)103, (byte)220, (byte)234,
     91         (byte)151, (byte)242, (byte)207, (byte)206, (byte)240, (byte)180, (byte)230, (byte)115,
     92         (byte)150, (byte)172, (byte)116,  (byte)34, (byte)231, (byte)173,  (byte)53, (byte)133,
     93         (byte)226, (byte)249,  (byte)55, (byte)232,  (byte)28, (byte)117, (byte)223, (byte)110,
     94         (byte)71, (byte)241,  (byte)26, (byte)113,  (byte)29,  (byte)41, (byte)197, (byte)137,
     95         (byte)111, (byte)183,  (byte)98,  (byte)14, (byte)170,  (byte)24, (byte)190,  (byte)27,
     96         (byte)252,  (byte)86,  (byte)62,  (byte)75, (byte)198, (byte)210, (byte)121,  (byte)32,
     97         (byte)154, (byte)219, (byte)192, (byte)254, (byte)120, (byte)205,  (byte)90, (byte)244,
     98         (byte)31, (byte)221, (byte)168,  (byte)51, (byte)136,   (byte)7, (byte)199,  (byte)49,
     99         (byte)177,  (byte)18,  (byte)16,  (byte)89,  (byte)39, (byte)128, (byte)236,  (byte)95,
    100         (byte)96,  (byte)81, (byte)127, (byte)169,  (byte)25, (byte)181,  (byte)74,  (byte)13,
    101         (byte)45, (byte)229, (byte)122, (byte)159, (byte)147, (byte)201, (byte)156, (byte)239,
    102         (byte)160, (byte)224,  (byte)59,  (byte)77, (byte)174,  (byte)42, (byte)245, (byte)176,
    103         (byte)200, (byte)235, (byte)187,  (byte)60, (byte)131,  (byte)83, (byte)153,  (byte)97,
    104         (byte)23,  (byte)43,   (byte)4, (byte)126, (byte)186, (byte)119, (byte)214,  (byte)38,
    105         (byte)225, (byte)105,  (byte)20,  (byte)99,  (byte)85,  (byte)33,  (byte)12, (byte)125,
    106         };
    107 
    108     // vector used in calculating key schedule (powers of x in GF(256))
    109     private static final int[] rcon = {
    110          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
    111          0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 };
    112 
    113     private int shift(
    114         int     r,
    115         int     shift)
    116     {
    117         return (r >>> shift) | (r << -shift);
    118     }
    119 
    120     /* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
    121 
    122     private static final int m1 = 0x80808080;
    123     private static final int m2 = 0x7f7f7f7f;
    124     private static final int m3 = 0x0000001b;
    125 
    126     private int FFmulX(int x)
    127     {
    128         return (((x & m2) << 1) ^ (((x & m1) >>> 7) * m3));
    129     }
    130 
    131     /*
    132        The following defines provide alternative definitions of FFmulX that might
    133        give improved performance if a fast 32-bit multiply is not available.
    134 
    135        private int FFmulX(int x) { int u = x & m1; u |= (u >> 1); return ((x & m2) << 1) ^ ((u >>> 3) | (u >>> 6)); }
    136        private static final int  m4 = 0x1b1b1b1b;
    137        private int FFmulX(int x) { int u = x & m1; return ((x & m2) << 1) ^ ((u - (u >>> 7)) & m4); }
    138 
    139     */
    140 
    141     private int mcol(int x)
    142     {
    143         int f2 = FFmulX(x);
    144         return f2 ^ shift(x ^ f2, 8) ^ shift(x, 16) ^ shift(x, 24);
    145     }
    146 
    147     private int inv_mcol(int x)
    148     {
    149         int f2 = FFmulX(x);
    150         int f4 = FFmulX(f2);
    151         int f8 = FFmulX(f4);
    152         int f9 = x ^ f8;
    153 
    154         return f2 ^ f4 ^ f8 ^ shift(f2 ^ f9, 8) ^ shift(f4 ^ f9, 16) ^ shift(f9, 24);
    155     }
    156 
    157 
    158     private int subWord(int x)
    159     {
    160         return (S[x&255]&255 | ((S[(x>>8)&255]&255)<<8) | ((S[(x>>16)&255]&255)<<16) | S[(x>>24)&255]<<24);
    161     }
    162 
    163     /**
    164      * Calculate the necessary round keys
    165      * The number of calculations depends on key size and block size
    166      * AES specified a fixed block size of 128 bits and key sizes 128/192/256 bits
    167      * This code is written assuming those are the only possible values
    168      */
    169     private int[][] generateWorkingKey(
    170                                     byte[] key,
    171                                     boolean forEncryption)
    172     {
    173         int         KC = key.length / 4;  // key length in words
    174         int         t;
    175 
    176         if (((KC != 4) && (KC != 6) && (KC != 8)) || ((KC * 4) != key.length))
    177         {
    178             throw new IllegalArgumentException("Key length not 128/192/256 bits.");
    179         }
    180 
    181         ROUNDS = KC + 6;  // This is not always true for the generalized Rijndael that allows larger block sizes
    182         int[][] W = new int[ROUNDS+1][4];   // 4 words in a block
    183 
    184         //
    185         // copy the key into the round key array
    186         //
    187 
    188         t = 0;
    189         int i = 0;
    190         while (i < key.length)
    191             {
    192                 W[t >> 2][t & 3] = (key[i]&0xff) | ((key[i+1]&0xff) << 8) | ((key[i+2]&0xff) << 16) | (key[i+3] << 24);
    193                 i+=4;
    194                 t++;
    195             }
    196 
    197         //
    198         // while not enough round key material calculated
    199         // calculate new values
    200         //
    201         int k = (ROUNDS + 1) << 2;
    202         for (i = KC; (i < k); i++)
    203             {
    204                 int temp = W[(i-1)>>2][(i-1)&3];
    205                 if ((i % KC) == 0)
    206                 {
    207                     temp = subWord(shift(temp, 8)) ^ rcon[(i / KC)-1];
    208                 }
    209                 else if ((KC > 6) && ((i % KC) == 4))
    210                 {
    211                     temp = subWord(temp);
    212                 }
    213 
    214                 W[i>>2][i&3] = W[(i - KC)>>2][(i-KC)&3] ^ temp;
    215             }
    216 
    217         if (!forEncryption)
    218         {
    219             for (int j = 1; j < ROUNDS; j++)
    220             {
    221                 for (i = 0; i < 4; i++)
    222                 {
    223                     W[j][i] = inv_mcol(W[j][i]);
    224                 }
    225             }
    226         }
    227 
    228         return W;
    229     }
    230 
    231     private int         ROUNDS;
    232     private int[][]     WorkingKey = null;
    233     private int         C0, C1, C2, C3;
    234     private boolean     forEncryption;
    235 
    236     private static final int BLOCK_SIZE = 16;
    237 
    238     /**
    239      * default constructor - 128 bit block size.
    240      */
    241     public AESLightEngine()
    242     {
    243     }
    244 
    245     /**
    246      * initialise an AES cipher.
    247      *
    248      * @param forEncryption whether or not we are for encryption.
    249      * @param params the parameters required to set up the cipher.
    250      * @exception IllegalArgumentException if the params argument is
    251      * inappropriate.
    252      */
    253     public void init(
    254         boolean           forEncryption,
    255         CipherParameters  params)
    256     {
    257         if (params instanceof KeyParameter)
    258         {
    259             WorkingKey = generateWorkingKey(((KeyParameter)params).getKey(), forEncryption);
    260             this.forEncryption = forEncryption;
    261             return;
    262         }
    263 
    264         throw new IllegalArgumentException("invalid parameter passed to AES init - " + params.getClass().getName());
    265     }
    266 
    267     public String getAlgorithmName()
    268     {
    269         return "AES";
    270     }
    271 
    272     public int getBlockSize()
    273     {
    274         return BLOCK_SIZE;
    275     }
    276 
    277     public int processBlock(
    278         byte[] in,
    279         int inOff,
    280         byte[] out,
    281         int outOff)
    282     {
    283         if (WorkingKey == null)
    284         {
    285             throw new IllegalStateException("AES engine not initialised");
    286         }
    287 
    288         if ((inOff + (32 / 2)) > in.length)
    289         {
    290             throw new DataLengthException("input buffer too short");
    291         }
    292 
    293         if ((outOff + (32 / 2)) > out.length)
    294         {
    295             throw new DataLengthException("output buffer too short");
    296         }
    297 
    298         if (forEncryption)
    299         {
    300             unpackBlock(in, inOff);
    301             encryptBlock(WorkingKey);
    302             packBlock(out, outOff);
    303         }
    304         else
    305         {
    306             unpackBlock(in, inOff);
    307             decryptBlock(WorkingKey);
    308             packBlock(out, outOff);
    309         }
    310 
    311         return BLOCK_SIZE;
    312     }
    313 
    314     public void reset()
    315     {
    316     }
    317 
    318     private final void unpackBlock(
    319         byte[]      bytes,
    320         int         off)
    321     {
    322         int     index = off;
    323 
    324         C0 = (bytes[index++] & 0xff);
    325         C0 |= (bytes[index++] & 0xff) << 8;
    326         C0 |= (bytes[index++] & 0xff) << 16;
    327         C0 |= bytes[index++] << 24;
    328 
    329         C1 = (bytes[index++] & 0xff);
    330         C1 |= (bytes[index++] & 0xff) << 8;
    331         C1 |= (bytes[index++] & 0xff) << 16;
    332         C1 |= bytes[index++] << 24;
    333 
    334         C2 = (bytes[index++] & 0xff);
    335         C2 |= (bytes[index++] & 0xff) << 8;
    336         C2 |= (bytes[index++] & 0xff) << 16;
    337         C2 |= bytes[index++] << 24;
    338 
    339         C3 = (bytes[index++] & 0xff);
    340         C3 |= (bytes[index++] & 0xff) << 8;
    341         C3 |= (bytes[index++] & 0xff) << 16;
    342         C3 |= bytes[index++] << 24;
    343     }
    344 
    345     private final void packBlock(
    346         byte[]      bytes,
    347         int         off)
    348     {
    349         int     index = off;
    350 
    351         bytes[index++] = (byte)C0;
    352         bytes[index++] = (byte)(C0 >> 8);
    353         bytes[index++] = (byte)(C0 >> 16);
    354         bytes[index++] = (byte)(C0 >> 24);
    355 
    356         bytes[index++] = (byte)C1;
    357         bytes[index++] = (byte)(C1 >> 8);
    358         bytes[index++] = (byte)(C1 >> 16);
    359         bytes[index++] = (byte)(C1 >> 24);
    360 
    361         bytes[index++] = (byte)C2;
    362         bytes[index++] = (byte)(C2 >> 8);
    363         bytes[index++] = (byte)(C2 >> 16);
    364         bytes[index++] = (byte)(C2 >> 24);
    365 
    366         bytes[index++] = (byte)C3;
    367         bytes[index++] = (byte)(C3 >> 8);
    368         bytes[index++] = (byte)(C3 >> 16);
    369         bytes[index++] = (byte)(C3 >> 24);
    370     }
    371 
    372     private void encryptBlock(int[][] KW)
    373     {
    374         int r, r0, r1, r2, r3;
    375 
    376         C0 ^= KW[0][0];
    377         C1 ^= KW[0][1];
    378         C2 ^= KW[0][2];
    379         C3 ^= KW[0][3];
    380 
    381         for (r = 1; r < ROUNDS - 1;)
    382         {
    383             r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
    384             r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
    385             r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
    386             r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
    387             C0 = mcol((S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
    388             C1 = mcol((S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24)) ^ KW[r][1];
    389             C2 = mcol((S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24)) ^ KW[r][2];
    390             C3 = mcol((S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24)) ^ KW[r++][3];
    391         }
    392 
    393         r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
    394         r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
    395         r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
    396         r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
    397 
    398         // the final round is a simple function of S
    399 
    400         C0 = (S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24) ^ KW[r][0];
    401         C1 = (S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24) ^ KW[r][1];
    402         C2 = (S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24) ^ KW[r][2];
    403         C3 = (S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24) ^ KW[r][3];
    404 
    405     }
    406 
    407     private final void decryptBlock(int[][] KW)
    408     {
    409         int r, r0, r1, r2, r3;
    410 
    411         C0 ^= KW[ROUNDS][0];
    412         C1 ^= KW[ROUNDS][1];
    413         C2 ^= KW[ROUNDS][2];
    414         C3 ^= KW[ROUNDS][3];
    415 
    416         for (r = ROUNDS-1; r>1;)
    417         {
    418             r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
    419             r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
    420             r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
    421             r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
    422             C0 = inv_mcol((Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24)) ^ KW[r][0];
    423             C1 = inv_mcol((Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24)) ^ KW[r][1];
    424             C2 = inv_mcol((Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
    425             C3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24)) ^ KW[r--][3];
    426         }
    427 
    428         r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
    429         r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
    430         r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
    431         r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
    432 
    433         // the final round's table is a simple function of Si
    434 
    435         C0 = (Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24) ^ KW[0][0];
    436         C1 = (Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24) ^ KW[0][1];
    437         C2 = (Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24) ^ KW[0][2];
    438         C3 = (Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24) ^ KW[0][3];
    439     }
    440 }
    441