/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
     15 #pragma once
     16 #ifndef __BLAKE2B_ROUND_H__
     17 #define __BLAKE2B_ROUND_H__
     18 
     19 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
     20 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
     21 
     22 #define TOF(reg) _mm_castsi128_ps((reg))
     23 #define TOI(reg) _mm_castps_si128((reg))
     24 
     25 #define LIKELY(x) __builtin_expect((x),1)
     26 
     27 
     28 /* Microarchitecture-specific macros */
     29 #ifndef HAVE_XOP
     30 #ifdef HAVE_SSSE3
     31 #define _mm_roti_epi64(x, c) \
     32     (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
     33     : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
     34     : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
     35     : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
     36     : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
     37 #else
     38 #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) ))
     39 #endif
     40 #else
     41 /* ... */
     42 #endif
     43 
     44 
     45 
     46 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
     47   row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
     48   row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
     49   \
     50   row4l = _mm_xor_si128(row4l, row1l); \
     51   row4h = _mm_xor_si128(row4h, row1h); \
     52   \
     53   row4l = _mm_roti_epi64(row4l, -32); \
     54   row4h = _mm_roti_epi64(row4h, -32); \
     55   \
     56   row3l = _mm_add_epi64(row3l, row4l); \
     57   row3h = _mm_add_epi64(row3h, row4h); \
     58   \
     59   row2l = _mm_xor_si128(row2l, row3l); \
     60   row2h = _mm_xor_si128(row2h, row3h); \
     61   \
     62   row2l = _mm_roti_epi64(row2l, -24); \
     63   row2h = _mm_roti_epi64(row2h, -24); \
     64 
     65 #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
     66   row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
     67   row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
     68   \
     69   row4l = _mm_xor_si128(row4l, row1l); \
     70   row4h = _mm_xor_si128(row4h, row1h); \
     71   \
     72   row4l = _mm_roti_epi64(row4l, -16); \
     73   row4h = _mm_roti_epi64(row4h, -16); \
     74   \
     75   row3l = _mm_add_epi64(row3l, row4l); \
     76   row3h = _mm_add_epi64(row3h, row4h); \
     77   \
     78   row2l = _mm_xor_si128(row2l, row3l); \
     79   row2h = _mm_xor_si128(row2h, row3h); \
     80   \
     81   row2l = _mm_roti_epi64(row2l, -63); \
     82   row2h = _mm_roti_epi64(row2h, -63); \
     83 
     84 #if defined(HAVE_SSSE3)
     85 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     86   t0 = _mm_alignr_epi8(row2h, row2l, 8); \
     87   t1 = _mm_alignr_epi8(row2l, row2h, 8); \
     88   row2l = t0; \
     89   row2h = t1; \
     90   \
     91   t0 = row3l; \
     92   row3l = row3h; \
     93   row3h = t0;    \
     94   \
     95   t0 = _mm_alignr_epi8(row4h, row4l, 8); \
     96   t1 = _mm_alignr_epi8(row4l, row4h, 8); \
     97   row4l = t1; \
     98   row4h = t0;
     99 
    100 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    101   t0 = _mm_alignr_epi8(row2l, row2h, 8); \
    102   t1 = _mm_alignr_epi8(row2h, row2l, 8); \
    103   row2l = t0; \
    104   row2h = t1; \
    105   \
    106   t0 = row3l; \
    107   row3l = row3h; \
    108   row3h = t0; \
    109   \
    110   t0 = _mm_alignr_epi8(row4l, row4h, 8); \
    111   t1 = _mm_alignr_epi8(row4h, row4l, 8); \
    112   row4l = t1; \
    113   row4h = t0;
    114 #else
    115 
    116 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    117   t0 = row4l;\
    118   t1 = row2l;\
    119   row4l = row3l;\
    120   row3l = row3h;\
    121   row3h = row4l;\
    122   row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
    123   row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
    124   row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
    125   row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
    126 
    127 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    128   t0 = row3l;\
    129   row3l = row3h;\
    130   row3h = t0;\
    131   t0 = row2l;\
    132   t1 = row4l;\
    133   row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
    134   row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
    135   row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
    136   row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
    137 
    138 #endif
    139 
    140 #if defined(HAVE_SSE41)
    141 #include "blake2b-load-sse41.h"
    142 #else
    143 #include "blake2b-load-sse2.h"
    144 #endif
    145 
    146 #define ROUND(r) \
    147   LOAD_MSG_ ##r ##_1(b0, b1); \
    148   G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    149   LOAD_MSG_ ##r ##_2(b0, b1); \
    150   G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    151   DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    152   LOAD_MSG_ ##r ##_3(b0, b1); \
    153   G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    154   LOAD_MSG_ ##r ##_4(b0, b1); \
    155   G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    156   UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
    157 
    158 #endif
    159 
    160