/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.] */

// Altivec-optimized SHA1 in C. This is tested on ppc64le only.
//
// References:
// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
// http://arctic.org/~dean/crypto/sha1.html
//
// This code is based on the generic SHA-1 implementation from OpenSSL, with
// AltiVec optimisations added on top.

#include <openssl/sha.h>

#if defined(OPENSSL_PPC64LE)

#include <altivec.h>

void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);

static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }

typedef vector unsigned int vec_uint32_t;
typedef vector unsigned char vec_uint8_t;

// Vector constants.
static const vec_uint8_t k_swap_endianness = {3,  2,  1, 0, 7,  6,  5,  4,
                                              11, 10, 9, 8, 15, 14, 13, 12};

// Shift amounts for byte and bit shifts and rotations.
static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
                                      32, 32, 32, 32, 32, 32, 32, 32};
static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
                                       96, 96, 96, 96, 96, 96, 96, 96};

#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL

// Vector versions of the above.
static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};

// Vector message scheduling: compute the message schedule for rounds i..i+3,
// where i is divisible by 4, and return w[i..i+3] as a vector. In addition,
// we precompute the sum of w[i..i+3] and the additive constant K; this
// offloads some of the additions from the integer execution units, which are
// also busy computing f().
//
// The byte-shifting code below may not be correct for big-endian systems.
static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
                                vec_uint32_t k) {
  const vector unsigned char unaligned_data =
      vec_vsx_ld(0, (const unsigned char *)data);
  const vec_uint32_t v = (vec_uint32_t)unaligned_data;
  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
  vec_st(w + k, 0, pre_added);
  return w;
}

// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]:
//
//   w'[i  ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
//   w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
//   w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
//   w'[i+3] = (     0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
//
//   w[i  ] = w'[i  ]
//   w[i+1] = w'[i+1]
//   w[i+2] = w'[i+2]
//   w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_12,
                                vec_uint32_t minus_16, vec_uint32_t k) {
  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
  const vec_uint32_t minus_14 = vec_sld(minus_12, minus_16, 8);
  const vec_uint32_t k_1_bit = vec_splat_u32(1);
  const vec_uint32_t w_prime =
      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
  const vec_uint32_t w =
      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
  vec_st(w + k, 0, pre_added);
  return w;
}
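
// For reference, this is the scalar form of the expansion that sched_16_31
// computes four lanes at a time (an illustrative sketch only; the vector code
// above is what actually runs):
//
//   for (int i = 16; i < 32; i++) {
//     w[i] = rotate(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
//   }
//
// The intra-vector dependency (w[i+3] needs w[i], which lives in the same
// vector) is what forces w'[i+3] above to be computed with a zero lane and
// patched with w'[i] <<< 1 afterwards.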

// Compute w[i..i+3] using this relation for i in [32, 36, 40, ..., 76]:
//
//   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2
static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_16,
                                vec_uint32_t minus_28, vec_uint32_t minus_32,
                                vec_uint32_t k) {
  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
  const vec_uint32_t k_2_bits = vec_splat_u32(2);
  const vec_uint32_t w =
      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
  vec_st(w + k, 0, pre_added);
  return w;
}

// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel.
//
//   #define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
//
// Another tweak, also from Wei Dai, applies to F_40_59:
// (x & a) | (y & a) is rewritten as (x | y) & a.
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)

// The K constants were pre-added to w during message scheduling, so the round
// bodies below do not add them again.
#define BODY_00_19(i, a, b, c, d, e, f)                          \
  do {                                                           \
    (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d))); \
  } while (0)
#define BODY_00_19(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_20_39(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_40_59(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)

#define BODY_60_79(i, a, b, c, d, e, f)                         \
  do {                                                          \
    (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
    (b) = rotate((b), 30);                                      \
  } while (0)
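
// For illustration, BODY_00_19(0, A, B, C, D, E, T) expands to
//
//   T = w[0] + E + rotate(A, 5) + F_00_19(B, C, D);
//   B = rotate(B, 30);
//
// Rotating the variable names through the macro arguments from round to
// round, instead of shuffling values through fixed variables, removes the
// register-to-register moves from the unrolled loop below.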

void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
  uint32_t A, B, C, D, E, T;

  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];

  for (;;) {
    vec_uint32_t vw[20];
    const uint32_t *w = (const uint32_t *)&vw;

    vec_uint32_t k = K_00_19_x_4;
    const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
    BODY_00_19(0, A, B, C, D, E, T);
    BODY_00_19(1, T, A, B, C, D, E);
    BODY_00_19(2, E, T, A, B, C, D);
    BODY_00_19(3, D, E, T, A, B, C);

    const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
    BODY_00_19(4, C, D, E, T, A, B);
    BODY_00_19(5, B, C, D, E, T, A);
    BODY_00_19(6, A, B, C, D, E, T);
    BODY_00_19(7, T, A, B, C, D, E);

    const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
    BODY_00_19(8, E, T, A, B, C, D);
    BODY_00_19(9, D, E, T, A, B, C);
    BODY_00_19(10, C, D, E, T, A, B);
    BODY_00_19(11, B, C, D, E, T, A);

    const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
    BODY_00_19(12, A, B, C, D, E, T);
    BODY_00_19(13, T, A, B, C, D, E);
    BODY_00_19(14, E, T, A, B, C, D);
    BODY_00_19(15, D, E, T, A, B, C);

    const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
    BODY_00_19(16, C, D, E, T, A, B);
    BODY_00_19(17, B, C, D, E, T, A);
    BODY_00_19(18, A, B, C, D, E, T);
    BODY_00_19(19, T, A, B, C, D, E);

    k = K_20_39_x_4;
    const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
    BODY_20_39(20, E, T, A, B, C, D);
    BODY_20_39(21, D, E, T, A, B, C);
    BODY_20_39(22, C, D, E, T, A, B);
    BODY_20_39(23, B, C, D, E, T, A);

    const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
    BODY_20_39(24, A, B, C, D, E, T);
    BODY_20_39(25, T, A, B, C, D, E);
    BODY_20_39(26, E, T, A, B, C, D);
    BODY_20_39(27, D, E, T, A, B, C);

    const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
    BODY_20_39(28, C, D, E, T, A, B);
    BODY_20_39(29, B, C, D, E, T, A);
    BODY_20_39(30, A, B, C, D, E, T);
    BODY_20_39(31, T, A, B, C, D, E);

    const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
    BODY_20_39(32, E, T, A, B, C, D);
    BODY_20_39(33, D, E, T, A, B, C);
    BODY_20_39(34, C, D, E, T, A, B);
    BODY_20_39(35, B, C, D, E, T, A);

    const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
    BODY_20_39(36, A, B, C, D, E, T);
    BODY_20_39(37, T, A, B, C, D, E);
    BODY_20_39(38, E, T, A, B, C, D);
    BODY_20_39(39, D, E, T, A, B, C);

    k = K_40_59_x_4;
    const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
    BODY_40_59(40, C, D, E, T, A, B);
    BODY_40_59(41, B, C, D, E, T, A);
    BODY_40_59(42, A, B, C, D, E, T);
    BODY_40_59(43, T, A, B, C, D, E);

    const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
    BODY_40_59(44, E, T, A, B, C, D);
    BODY_40_59(45, D, E, T, A, B, C);
    BODY_40_59(46, C, D, E, T, A, B);
    BODY_40_59(47, B, C, D, E, T, A);

    const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
    BODY_40_59(48, A, B, C, D, E, T);
    BODY_40_59(49, T, A, B, C, D, E);
    BODY_40_59(50, E, T, A, B, C, D);
    BODY_40_59(51, D, E, T, A, B, C);

    const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
    BODY_40_59(52, C, D, E, T, A, B);
    BODY_40_59(53, B, C, D, E, T, A);
    BODY_40_59(54, A, B, C, D, E, T);
    BODY_40_59(55, T, A, B, C, D, E);

    const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
    BODY_40_59(56, E, T, A, B, C, D);
    BODY_40_59(57, D, E, T, A, B, C);
    BODY_40_59(58, C, D, E, T, A, B);
    BODY_40_59(59, B, C, D, E, T, A);

    k = K_60_79_x_4;
    const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
    BODY_60_79(60, A, B, C, D, E, T);
    BODY_60_79(61, T, A, B, C, D, E);
    BODY_60_79(62, E, T, A, B, C, D);
    BODY_60_79(63, D, E, T, A, B, C);

    const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
    BODY_60_79(64, C, D, E, T, A, B);
    BODY_60_79(65, B, C, D, E, T, A);
    BODY_60_79(66, A, B, C, D, E, T);
    BODY_60_79(67, T, A, B, C, D, E);

    const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
    BODY_60_79(68, E, T, A, B, C, D);
    BODY_60_79(69, D, E, T, A, B, C);
    BODY_60_79(70, C, D, E, T, A, B);
    BODY_60_79(71, B, C, D, E, T, A);

    const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
    BODY_60_79(72, A, B, C, D, E, T);
    BODY_60_79(73, T, A, B, C, D, E);
    BODY_60_79(74, E, T, A, B, C, D);
    BODY_60_79(75, D, E, T, A, B, C);

    // We don't use the last value.
    (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
    BODY_60_79(76, C, D, E, T, A, B);
    BODY_60_79(77, B, C, D, E, T, A);
    BODY_60_79(78, A, B, C, D, E, T);
    BODY_60_79(79, T, A, B, C, D, E);
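
    // After 80 rounds of renaming through the macro arguments, the five
    // working values end up in E, T, A, B and C, which is why the state
    // update below reads them in that order.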
    const uint32_t mask = 0xffffffffUL;
    state[0] = (state[0] + E) & mask;
    state[1] = (state[1] + T) & mask;
    state[2] = (state[2] + A) & mask;
    state[3] = (state[3] + B) & mask;
    state[4] = (state[4] + C) & mask;

    data += 64;
    if (--num == 0) {
      break;
    }

    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}

#endif  // OPENSSL_PPC64LE

#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79
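
// Usage sketch (illustrative only, not compiled as part of this file): the
// caller performs SHA-1 padding and then feeds whole 64-byte blocks to
// sha1_block_data_order. For example, the empty message pads to a single
// block containing just the 0x80 terminator and a zero bit-length:
//
//   uint32_t state[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
//                        0xc3d2e1f0};
//   uint8_t block[64] = {0x80};
//   sha1_block_data_order(state, block, 1);
//   // state now holds SHA1("") =
//   // da39a3ee 5e6b4b0d 3255bfef 95601890 afd80709.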