/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size.

#include <openssl/poly1305.h>

#include "../internal.h"


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  //  80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;        //   8 bytes
  uint64_t leftover;       //   8 bytes
  uint8_t buffer[64];      //  64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

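// The scalar tail code below keeps r and h as three limbs in base 2^44
// (44 + 44 + 42 bits, masks 0xfffffffffff and 0x3ffffffffff), while the SSE2
// path re-splits them into five 26-bit limbs so that two blocks can be
// processed in parallel from one 128-bit register.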
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

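  // At this point st->P[1] holds r^2 and st->P[0] holds r^4, each stored as
  // five 26-bit limbs duplicated across both 64-bit lanes.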
  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

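    // Multiply the first pair of blocks (bytes 0..31 of this 64-byte chunk)
    // by [r^2,r^2] and accumulate into T0..T4; the second pair is added in
    // unscaled below, matching the update H*r^4 + (first pair)*r^2 +
    // (second pair).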
    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

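    // The carry chain below only partially reduces each limb back under 2^26;
    // the carry out of the top limb folds back into the bottom limb as 5*c,
    // since 2^130 == 5 (mod 2^130 - 5). The final full reduction happens in
    // CRYPTO_poly1305_finish.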
    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

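  // If at least 32 bytes remain, absorb one more pair of blocks weighted by
  // [r^2,r^2]. Afterwards the two lanes are weighted by [r^2,r], summed, and
  // repacked into three base 2^44 limbs in st->HH for the scalar tail.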
  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

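  // _mm_mul_epu32 only reads the even 32-bit element of each 64-bit lane, so
  // overwriting d[2] with the 26-bit limbs of r turns P[1] into [r^2,r] while
  // leaving the raw r and pad values in d[1]/d[3] intact for
  // CRYPTO_poly1305_finish.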
  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

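  // The remaining complete 16-byte blocks and the final partial block are
  // handled by the scalar donna code in base 2^44. Complete blocks carry the
  // implicit 2^128 bit (the `1 << 40` added to h2); the zero-padded partial
  // block instead gets an explicit 0x01 byte appended and no extra bit.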
poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

  // final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64