/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described in
 * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
 * block size */

#include <openssl/poly1305.h>


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define ALIGN(x) __attribute__((aligned(x)))
/* inline is not a keyword in C89. */
#define INLINE
#define U8TO64_LE(m) (*(uint64_t *)(m))
#define U8TO32_LE(m) (*(uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;
typedef unsigned __int128 uint128_t;

static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0,
                                                                (1 << 24), 0};
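
/* Limb layout: the SIMD code keeps the accumulator H and each message block
 * as five 26-bit limbs, one limb per xmm register, with the same limb for two
 * independent lanes held in the low 32 bits of each 64-bit half.
 * poly1305_x64_sse2_message_mask extracts those 26-bit limbs,
 * poly1305_x64_sse2_5 folds anything at or above 2^130 back down (2^130 is
 * congruent to 5 modulo 2^130 - 5), and poly1305_x64_sse2_1shl128 sets bit 24
 * of the fifth limb, i.e. bit 128, the padding bit appended to every full
 * 16-byte block. The scalar tail in CRYPTO_poly1305_finish instead works in
 * three 44/44/42-bit limbs, hence the 0xfffffffffff / 0x3ffffffffff masks
 * there. */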

static uint128_t INLINE add128(uint128_t a, uint128_t b) { return a + b; }

static uint128_t INLINE add128_64(uint128_t a, uint64_t b) { return a + b; }

static uint128_t INLINE mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static uint64_t INLINE lo128(uint128_t a) { return (uint64_t)a; }

static uint64_t INLINE shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static uint64_t INLINE shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /* 80 bytes */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2;    [24 bytes] */
  /* uint64_t pad0,pad1;   [16 bytes] */
  uint64_t started;        /* 8 bytes */
  uint64_t leftover;       /* 8 bytes */
  uint8_t buffer[64];      /* 64 bytes */
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static poly1305_state_internal INLINE *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

/* copy 0-63 bytes */
static void INLINE
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static void INLINE poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static size_t INLINE poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  /* store r in un-used space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
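
  /* The squarings below work in the 44/44/42-bit limb representation. The
   * s22 = r22 * (5 << 2) term folds product terms that land at 2^132 back
   * into the low limb: 2^130 is congruent to 5 modulo 2^130 - 5, and the
   * limbs sit at 2^0, 2^44 and 2^88, so an overflow at 2^132 comes back as a
   * factor of 4 * 5 = 20. In the 26-bit SIMD representation every overflowing
   * product term lands at 2^130 times a power of 2^26, so the fold-back
   * factor there is exactly 5, which is why S2x = R2x * 5. */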
  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  /* H = [Mx,My] */
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                          _mm_loadl_epi64((xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                          _mm_loadl_epi64((xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}
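
/* Process the input 64 bytes at a time. The two 64-bit halves of every xmm
 * register hold two independent accumulators, so each iteration absorbs four
 * 16-byte blocks as
 *
 *   H = H * [r^4, r^4] + [m0, m1] * [r^2, r^2] + [m2, m3]
 *
 * poly1305_combine() later weights the lanes by [r^2, r] and adds them, which
 * makes the two-lane evaluation equal to running the serial
 * h = (h + block) * r recurrence over all blocks. */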

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)),
                            _mm_loadl_epi64((xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)),
                            _mm_loadl_epi64((xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);
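
    /* Partial carry reduction: two carry chains run in parallel, one starting
     * at limb 0 and one at limb 3. The wrap from limb 4 back to limb 0 is
     * multiplied by 5 because 2^130 is congruent to 5 modulo 2^130 - 5. This
     * only brings each limb back to roughly 26 bits; the full reduction
     * happens in CRYPTO_poly1305_finish. */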
    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
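
/* Collapse the two SIMD lanes into a single 130-bit value. If at least 32
 * bytes are still buffered, one more [Mx,My] pair is absorbed with [r^2,r^2]
 * first (reported back via the return value). The lanes are then weighted by
 * [r^2,r], added together, carried, and repacked from five 26-bit limbs into
 * the three 44/44/42-bit limbs in st->HH that the scalar finish code uses. */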

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }
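
  /* The last multiply uses [r^2, r]. P[1] currently holds r^2 in the low 32
   * bits of each 64-bit half (d[0] and d[2]); the d[2] stores below overwrite
   * the high half with the 26-bit limbs of r itself, rebuilt from the raw
   * value that CRYPTO_poly1305_init stashed in the odd 32-bit lanes. After
   * this multiply the two halves are summed, so the first-lane accumulator is
   * weighted by r^2 and the second by r. */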
  /* finalize, H *= [r^2,r] */
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull;
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull;
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull;

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  /* need at least 32 initial bytes to start the accelerated branch */
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}
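
/* The finish step drops back to scalar 64-bit code. poly1305_combine() leaves
 * the running sum in st->HH as three 44/44/42-bit limbs (or zero if the SIMD
 * path never started); the loop below absorbs any remaining buffered blocks
 * with the plain h = (h + block) * r recurrence, does a constant-time final
 * reduction modulo 2^130 - 5, adds the key's pad, and writes the 16-byte tag
 * in little-endian order. */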

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

/* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */