// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog (at) gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;

template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };

#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))

#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
  const Packet8i p8i_##NAME = pset1<Packet8i>(X)

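// Illustrative usage of the macros above (example values only, not part of
// this file's API): inside a vectorized kernel one might write
//   _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);                   // p8f_half = {0.5f x 8}
//   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(abs_mask, 0x7FFFFFFF); // bit pattern viewed as floats
//   Packet8f y = pand(x, p8f_abs_mask);                           // e.g. |x|, cf. pabs below
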
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasSin  = EIGEN_FAST_MATH,
    HasCos  = 0,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh  = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
#endif

template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
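
// These specializations feed Eigen's expression cost model: rough relative
// cost estimates for a vectorized division (approximate figures; worth
// re-checking against the target microarchitecture).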

/* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int>    : default_packet_traits
{
  typedef Packet8i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/

template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };

template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }

template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
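
// For example (illustrative): plset<Packet8f>(10.f) yields the packet
// {10,11,12,13,14,15,16,17}; lane i holds a + float(i).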

template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}

template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }


template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AVX");
  return pset1<Packet8i>(0);
}

#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // clang generates a vfmadd213ps instruction plus some register-to-register
  // vmovaps, and gcc generates a vfmadd132ps instruction, so we force a
  // vfmadd231ps instruction instead, since the most common use case is to
  // accumulate the result of the product into c.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
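
// pmadd computes a*b+c per lane with a single rounding step when FMA is
// available. Illustrative equivalence (example only):
//   pmadd(pset1<Packet8f>(2.f), pset1<Packet8f>(3.f), pset1<Packet8f>(1.f))
// holds 7.f in every lane, matching the scalar expression 2.f*3.f + 1.f.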

template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }

template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }

template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }

// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
//   return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "in-place" permutation of the lower 128 bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the whole register to get everything in shape:
  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return  _mm256_permute_pd(tmp, 3<<2);
}

// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
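
// Illustrative example (not from the original source): with floats
// {1,2,3,4,...} in memory, ploaddup yields {1,1,2,2,3,3,4,4} and ploadquad
// yields {1,1,1,1,2,2,2,2}.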

template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
// pstore expects a 32-byte-aligned destination, so use the aligned store here
// (the original code used _mm256_storeu_si256, which was merely suboptimal).
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }

// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
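
// Gather example (illustrative): pgather<float, Packet8f>(ptr, 2) packs every
// other float, i.e. lane i receives ptr[2*i].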

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(low);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(high);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
{
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(low);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(high);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}

template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}

#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif

template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}


template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
  // Equivalent alternative (kept for reference; it was previously left as
  // unreachable code after the return):
  //   __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  //   return _mm256_permute_pd(swap_halves,5);
}

// pabs: compute the absolute value by clearing the sign bit with a bit mask
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  // for doubles, the sign bit sits in the upper 32 bits of each 64-bit lane
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}

// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);

    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
    __m256 sum4 = _mm256_add_ps(perm4, hsum8);

    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
    return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  return _mm256_blend_pd(tmp0, tmp1, 0xC);
}

template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
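
// Reduction example (illustrative): predux(plset<Packet8f>(1.f)) returns
// 1+2+...+8 = 36.f; the two 128-bit halves are first folded with one SSE add,
// then the SSE-level predux finishes the sum.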

template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}

template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

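// palign_impl<Offset>::run(first, second) overwrites `first` with the
// concatenation of the two packets shifted left by Offset elements.
// Illustrative example for Packet4d, Offset==1: first = {f0,f1,f2,f3} and
// second = {s0,s1,s2,s3} give first = {f1,f2,f3,s0}.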
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_ps(first, second, 1);
      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_ps(first, second, 3);
      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_ps(first, second, 7);
      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
    }
    else if (Offset==4)
    {
      first = _mm256_blend_ps(first, second, 15);
      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
    }
    else if (Offset==5)
    {
      first = _mm256_blend_ps(first, second, 31);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0x88);
    }
    else if (Offset==6)
    {
      first = _mm256_blend_ps(first, second, 63);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xcc);
    }
    else if (Offset==7)
    {
      first = _mm256_blend_ps(first, second, 127);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xee);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4d>
{
  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_pd(first, second, 1);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 0xA);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_pd(first, second, 3);
      first = _mm256_permute2f128_pd(first, first, 1);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_pd(first, second, 7);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 5);
    }
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
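
// Transpose example (illustrative): if kernel.packet[r] initially holds row r
// of a square block, then after ptranspose lane r of kernel.packet[c] equals
// lane c of the old kernel.packet[r], i.e. the block is transposed in place.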

template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
  const __m256 zero = _mm256_setzero_ps();
  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
  const __m256d zero = _mm256_setzero_pd();
  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
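
// Blend semantics (illustrative): lane i of the result is thenPacket[i] when
// ifPacket.select[i] is nonzero and elsePacket[i] otherwise; the comparison
// against zero builds the mask consumed by blendv.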

template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
}

template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
}

template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
}

template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
{
  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_AVX_H