/* sha1-altivec.c */
      1 /* Copyright (C) 1995-1998 Eric Young (eay (at) cryptsoft.com)
      2  * All rights reserved.
      3  *
      4  * This package is an SSL implementation written
      5  * by Eric Young (eay (at) cryptsoft.com).
      6  * The implementation was written so as to conform with Netscapes SSL.
      7  *
      8  * This library is free for commercial and non-commercial use as long as
      9  * the following conditions are aheared to.  The following conditions
     10  * apply to all code found in this distribution, be it the RC4, RSA,
     11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
     12  * included with this distribution is covered by the same copyright terms
     13  * except that the holder is Tim Hudson (tjh (at) cryptsoft.com).
     14  *
     15  * Copyright remains Eric Young's, and as such any Copyright notices in
     16  * the code are not to be removed.
     17  * If this package is used in a product, Eric Young should be given attribution
     18  * as the author of the parts of the library used.
     19  * This can be in the form of a textual message at program startup or
     20  * in documentation (online or textual) provided with the package.
     21  *
     22  * Redistribution and use in source and binary forms, with or without
     23  * modification, are permitted provided that the following conditions
     24  * are met:
     25  * 1. Redistributions of source code must retain the copyright
     26  *    notice, this list of conditions and the following disclaimer.
     27  * 2. Redistributions in binary form must reproduce the above copyright
     28  *    notice, this list of conditions and the following disclaimer in the
     29  *    documentation and/or other materials provided with the distribution.
     30  * 3. All advertising materials mentioning features or use of this software
     31  *    must display the following acknowledgement:
     32  *    "This product includes cryptographic software written by
     33  *     Eric Young (eay (at) cryptsoft.com)"
     34  *    The word 'cryptographic' can be left out if the rouines from the library
     35  *    being used are not cryptographic related :-).
     36  * 4. If you include any Windows specific code (or a derivative thereof) from
     37  *    the apps directory (application code) you must include an acknowledgement:
     38  *    "This product includes software written by Tim Hudson (tjh (at) cryptsoft.com)"
     39  *
     40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
     41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     50  * SUCH DAMAGE.
     51  *
     52  * The licence and distribution terms for any publically available version or
     53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
     54  * copied and put under another distribution licence
     55  * [including the GNU Public Licence.] */
     56 
     57 // Altivec-optimized SHA1 in C. This is tested on ppc64le only.
     58 //
     59 // References:
     60 // https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
     61 // http://arctic.org/~dean/crypto/sha1.html
     62 //
     63 // This code used the generic SHA-1 from OpenSSL as a basis and AltiVec
     64 // optimisations were added on top.
     65 
     66 #include <openssl/sha.h>
     67 
     68 #if defined(OPENSSL_PPC64LE)
     69 
     70 #include <altivec.h>
     71 
     72 void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
     73 
     74 static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }
     75 
// Vector types: 4 x 32-bit words (16 bytes) per 128-bit AltiVec register.
typedef vector unsigned int vec_uint32_t;
typedef vector unsigned char vec_uint8_t;

// Vector constants
// vec_perm mask that byte-swaps each 32-bit lane, converting the big-endian
// SHA-1 message words to the host's little-endian word order.
static const vec_uint8_t k_swap_endianness = {3,  2,  1, 0, 7,  6,  5,  4,
                                              11, 10, 9, 8, 15, 14, 13, 12};

// Shift amounts for byte and bit shifts and rotations. Element values are in
// bits (4 bytes = 32 bits, 12 bytes = 96 bits); vec_slo/vec_sro read the
// byte-shift count from a few bits of one byte of this operand, and every
// byte is set identically so the count is found wherever the implementation
// looks for it.
static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
                                      32, 32, 32, 32, 32, 32, 32, 32};
static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
                                       96, 96, 96, 96, 96, 96, 96, 96};

// The four SHA-1 round constants (FIPS 180-4, section 4.2.1).
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL

// Vector versions of the above (one copy per 32-bit lane).
static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};
     99 
// vector message scheduling: compute message schedule for round i..i+3 where i
// is divisible by 4. We return the schedule w[i..i+3] as a vector. In
// addition, we also precompute sum w[i..+3] and an additive constant K. This
// is done to offload some computation of f() in the integer execution units.
//
// Byte shifting code below may not be correct for big-endian systems.
//
// sched_00_15 handles rounds 0-15: it loads 16 message bytes from |data|
// (which need not be aligned), byte-swaps each 32-bit word to host order,
// stores w[i..i+3] + K into |*pre_added| for the scalar round bodies, and
// returns w[i..i+3] for use by later scheduling steps.
static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
                                vec_uint32_t k) {
  // vec_vsx_ld is an unaligned VSX load; a plain vec_ld would silently mask
  // off the low address bits.
  const vector unsigned char unaligned_data =
    vec_vsx_ld(0, (const unsigned char*) data);
  const vec_uint32_t v = (vec_uint32_t) unaligned_data;
  // SHA-1 message words are big-endian; swap each lane to host order.
  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
  // Pre-add the round constant so the scalar rounds read w[i] + K directly.
  vec_st(w + k, 0, pre_added);
  return w;
}
    115 
// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]
//
// w'[i  ]  = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
// w'[i+1]  = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
// w'[i+2]  = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
// w'[i+3]  = (     0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
//
// w[  i] = w'[  i]
// w[i+1] = w'[i+1]
// w[i+2] = w'[i+2]
// w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
//
// Lane i+3 of the standard recurrence would need w[i], which is being
// computed in this very vector, so it is computed with a 0 in that position
// and patched up afterwards with the final XOR. |minus_4| holds
// w[i-4..i-1], |minus_8| holds w[i-8..i-5], and so on; the result plus the
// round constant |k| is stored to |*pre_added| for the scalar rounds.
static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_12,
                                vec_uint32_t minus_16, vec_uint32_t k) {
  // Whole-register shift by 4 bytes: {w[i-3], w[i-2], w[i-1], 0} in schedule
  // order (little-endian lane layout; see the note above about big-endian).
  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
  // Splice the two older vectors into the window {w[i-14..i-11]}.
  const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8);
  const vec_uint32_t k_1_bit = vec_splat_u32(1);
  // All four w' lanes at once (lane i+3 still missing its w[i] tap).
  const vec_uint32_t w_prime =
      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
  // Patch lane i+3: the 12-byte shift moves w'[i] into that lane, rotated
  // left by one more bit, and the XOR folds it in.
  const vec_uint32_t w =
      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
  vec_st(w + k, 0, pre_added);
  return w;
}
    140 
// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76]
//
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2
//
// No tap is closer than w[i-6], so all four lanes can be computed in a
// single vector step with no intra-vector dependency to patch up.
static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_16,
                                vec_uint32_t minus_28, vec_uint32_t minus_32,
                                vec_uint32_t k) {
  // Splice the two newest vectors into the window {w[i-6..i-3]}.
  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
  const vec_uint32_t k_2_bits = vec_splat_u32(2);
  const vec_uint32_t w =
      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
  // Store w[i..i+3] + K for the scalar round bodies.
  vec_st(w + k, 0, pre_added);
  return w;
}
    154 
// As pointed out by Wei Dai <weidai (at) eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel.
//
//   #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
//
// I've just become aware of another tweak to be made, again from Wei Dai, in
// F_40_59: (x&a)|(y&a) -> (x|y)&a
//
// These are the SHA-1 round functions f(b, c, d) of FIPS 180-4 in
// reduced-operation form: Ch (select c or d by b), Parity, and Maj.
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)

// We pre-added the K constants during message scheduling.
//
// Each BODY_* macro is one SHA-1 round. Instead of shuffling the five
// working variables every round, the caller rotates the *argument names*
// from round to round: |f| receives the new "a" value for the next round,
// and |b| is rotated in place to become the next round's "c".
#define BODY_00_19(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_20_39(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_40_59(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_60_79(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)
    189 
    190 void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
    191   uint32_t A, B, C, D, E, T;
    192 
    193   A = state[0];
    194   B = state[1];
    195   C = state[2];
    196   D = state[3];
    197   E = state[4];
    198 
    199   for (;;) {
    200     vec_uint32_t vw[20];
    201     const uint32_t *w = (const uint32_t *)&vw;
    202 
    203     vec_uint32_t k = K_00_19_x_4;
    204     const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
    205     BODY_00_19(0, A, B, C, D, E, T);
    206     BODY_00_19(1, T, A, B, C, D, E);
    207     BODY_00_19(2, E, T, A, B, C, D);
    208     BODY_00_19(3, D, E, T, A, B, C);
    209 
    210     const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
    211     BODY_00_19(4, C, D, E, T, A, B);
    212     BODY_00_19(5, B, C, D, E, T, A);
    213     BODY_00_19(6, A, B, C, D, E, T);
    214     BODY_00_19(7, T, A, B, C, D, E);
    215 
    216     const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
    217     BODY_00_19(8, E, T, A, B, C, D);
    218     BODY_00_19(9, D, E, T, A, B, C);
    219     BODY_00_19(10, C, D, E, T, A, B);
    220     BODY_00_19(11, B, C, D, E, T, A);
    221 
    222     const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
    223     BODY_00_19(12, A, B, C, D, E, T);
    224     BODY_00_19(13, T, A, B, C, D, E);
    225     BODY_00_19(14, E, T, A, B, C, D);
    226     BODY_00_19(15, D, E, T, A, B, C);
    227 
    228     const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
    229     BODY_00_19(16, C, D, E, T, A, B);
    230     BODY_00_19(17, B, C, D, E, T, A);
    231     BODY_00_19(18, A, B, C, D, E, T);
    232     BODY_00_19(19, T, A, B, C, D, E);
    233 
    234     k = K_20_39_x_4;
    235     const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
    236     BODY_20_39(20, E, T, A, B, C, D);
    237     BODY_20_39(21, D, E, T, A, B, C);
    238     BODY_20_39(22, C, D, E, T, A, B);
    239     BODY_20_39(23, B, C, D, E, T, A);
    240 
    241     const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
    242     BODY_20_39(24, A, B, C, D, E, T);
    243     BODY_20_39(25, T, A, B, C, D, E);
    244     BODY_20_39(26, E, T, A, B, C, D);
    245     BODY_20_39(27, D, E, T, A, B, C);
    246 
    247     const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
    248     BODY_20_39(28, C, D, E, T, A, B);
    249     BODY_20_39(29, B, C, D, E, T, A);
    250     BODY_20_39(30, A, B, C, D, E, T);
    251     BODY_20_39(31, T, A, B, C, D, E);
    252 
    253     const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
    254     BODY_20_39(32, E, T, A, B, C, D);
    255     BODY_20_39(33, D, E, T, A, B, C);
    256     BODY_20_39(34, C, D, E, T, A, B);
    257     BODY_20_39(35, B, C, D, E, T, A);
    258 
    259     const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
    260     BODY_20_39(36, A, B, C, D, E, T);
    261     BODY_20_39(37, T, A, B, C, D, E);
    262     BODY_20_39(38, E, T, A, B, C, D);
    263     BODY_20_39(39, D, E, T, A, B, C);
    264 
    265     k = K_40_59_x_4;
    266     const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
    267     BODY_40_59(40, C, D, E, T, A, B);
    268     BODY_40_59(41, B, C, D, E, T, A);
    269     BODY_40_59(42, A, B, C, D, E, T);
    270     BODY_40_59(43, T, A, B, C, D, E);
    271 
    272     const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
    273     BODY_40_59(44, E, T, A, B, C, D);
    274     BODY_40_59(45, D, E, T, A, B, C);
    275     BODY_40_59(46, C, D, E, T, A, B);
    276     BODY_40_59(47, B, C, D, E, T, A);
    277 
    278     const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
    279     BODY_40_59(48, A, B, C, D, E, T);
    280     BODY_40_59(49, T, A, B, C, D, E);
    281     BODY_40_59(50, E, T, A, B, C, D);
    282     BODY_40_59(51, D, E, T, A, B, C);
    283 
    284     const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
    285     BODY_40_59(52, C, D, E, T, A, B);
    286     BODY_40_59(53, B, C, D, E, T, A);
    287     BODY_40_59(54, A, B, C, D, E, T);
    288     BODY_40_59(55, T, A, B, C, D, E);
    289 
    290     const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
    291     BODY_40_59(56, E, T, A, B, C, D);
    292     BODY_40_59(57, D, E, T, A, B, C);
    293     BODY_40_59(58, C, D, E, T, A, B);
    294     BODY_40_59(59, B, C, D, E, T, A);
    295 
    296     k = K_60_79_x_4;
    297     const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
    298     BODY_60_79(60, A, B, C, D, E, T);
    299     BODY_60_79(61, T, A, B, C, D, E);
    300     BODY_60_79(62, E, T, A, B, C, D);
    301     BODY_60_79(63, D, E, T, A, B, C);
    302 
    303     const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
    304     BODY_60_79(64, C, D, E, T, A, B);
    305     BODY_60_79(65, B, C, D, E, T, A);
    306     BODY_60_79(66, A, B, C, D, E, T);
    307     BODY_60_79(67, T, A, B, C, D, E);
    308 
    309     const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
    310     BODY_60_79(68, E, T, A, B, C, D);
    311     BODY_60_79(69, D, E, T, A, B, C);
    312     BODY_60_79(70, C, D, E, T, A, B);
    313     BODY_60_79(71, B, C, D, E, T, A);
    314 
    315     const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
    316     BODY_60_79(72, A, B, C, D, E, T);
    317     BODY_60_79(73, T, A, B, C, D, E);
    318     BODY_60_79(74, E, T, A, B, C, D);
    319     BODY_60_79(75, D, E, T, A, B, C);
    320 
    321     // We don't use the last value
    322     (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
    323     BODY_60_79(76, C, D, E, T, A, B);
    324     BODY_60_79(77, B, C, D, E, T, A);
    325     BODY_60_79(78, A, B, C, D, E, T);
    326     BODY_60_79(79, T, A, B, C, D, E);
    327 
    328     const uint32_t mask = 0xffffffffUL;
    329     state[0] = (state[0] + E) & mask;
    330     state[1] = (state[1] + T) & mask;
    331     state[2] = (state[2] + A) & mask;
    332     state[3] = (state[3] + B) & mask;
    333     state[4] = (state[4] + C) & mask;
    334 
    335     data += 64;
    336     if (--num == 0) {
    337       break;
    338     }
    339 
    340     A = state[0];
    341     B = state[1];
    342     C = state[2];
    343     D = state[3];
    344     E = state[4];
    345   }
    346 }
    347 
    348 #endif  // OPENSSL_PPC64LE
    349 
// Scrub the helper macros so they cannot leak into any file that includes or
// concatenates this one. (When OPENSSL_PPC64LE is unset the macros were
// never defined; #undef of an undefined name is a no-op.)
#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79
    362