Home | History | Annotate | Download | only in SSE
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud (at) inria.fr>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_COMPLEX_SSE_H
     11 #define EIGEN_COMPLEX_SSE_H
     12 
     13 namespace Eigen {
     14 
     15 namespace internal {
     16 
     17 //---------- float ----------
     18 struct Packet2cf
     19 {
     20   EIGEN_STRONG_INLINE Packet2cf() {}
     21   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
     22   __m128  v;
     23 };
     24 
     25 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
     26 // to leverage AVX instructions.
     27 #ifndef EIGEN_VECTORIZE_AVX
     28 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     29 {
     30   typedef Packet2cf type;
     31   typedef Packet2cf half;
     32   enum {
     33     Vectorizable = 1,
     34     AlignedOnScalar = 1,
     35     size = 2,
     36     HasHalfPacket = 0,
     37 
     38     HasAdd    = 1,
     39     HasSub    = 1,
     40     HasMul    = 1,
     41     HasDiv    = 1,
     42     HasNegate = 1,
     43     HasAbs    = 0,
     44     HasAbs2   = 0,
     45     HasMin    = 0,
     46     HasMax    = 0,
     47     HasSetLinear = 0,
     48     HasBlend = 1
     49   };
     50 };
     51 #endif
     52 
     53 template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
     54 
     55 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
     56 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
     57 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
     58 {
     59   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
     60   return Packet2cf(_mm_xor_ps(a.v,mask));
     61 }
     62 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
     63 {
     64   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
     65   return Packet2cf(_mm_xor_ps(a.v,mask));
     66 }
     67 
     68 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
     69 {
     70   #ifdef EIGEN_VECTORIZE_SSE3
     71   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
     72                                  _mm_mul_ps(_mm_movehdup_ps(a.v),
     73                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
     74 //   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
     75 //                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
     76 //                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
     77   #else
     78   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
     79   return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
     80                               _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
     81                                                     vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
     82   #endif
     83 }
     84 
     85 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
     86 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
     87 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
     88 template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
     89 
     90 template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
     91 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
     92 
     93 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
     94 {
     95   Packet2cf res;
     96 #if EIGEN_GNUC_AT_MOST(4,2)
     97   // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
     98   res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
     99 #elif EIGEN_GNUC_AT_LEAST(4,6)
    100   // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
    101   #pragma GCC diagnostic push
    102   #pragma GCC diagnostic ignored "-Wuninitialized"
    103   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
    104   #pragma GCC diagnostic pop
    105 #else
    106   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
    107 #endif
    108   return Packet2cf(_mm_movelh_ps(res.v,res.v));
    109 }
    110 
    111 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
    112 
    113 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
    114 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
    115 
    116 
    117 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
    118 {
    119   return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
    120                               std::imag(from[0*stride]), std::real(from[0*stride])));
    121 }
    122 
    123 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
    124 {
    125   to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
    126                                      _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
    127   to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
    128                                      _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
    129 }
    130 
    131 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
    132 
    133 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
    134 {
    135   #if EIGEN_GNUC_AT_MOST(4,3)
    136   // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
    137   // This workaround also fix invalid code generation with gcc 4.3
    138   EIGEN_ALIGN16 std::complex<float> res[2];
    139   _mm_store_ps((float*)res, a.v);
    140   return res[0];
    141   #else
    142   std::complex<float> res;
    143   _mm_storel_pi((__m64*)&res, a.v);
    144   return res;
    145   #endif
    146 }
    147 
    148 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
    149 
    150 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
    151 {
    152   return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
    153 }
    154 
    155 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
    156 {
    157   return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
    158 }
    159 
    160 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
    161 {
    162   return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
    163 }
    164 
    165 template<int Offset>
    166 struct palign_impl<Offset,Packet2cf>
    167 {
    168   static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
    169   {
    170     if (Offset==1)
    171     {
    172       first.v = _mm_movehl_ps(first.v, first.v);
    173       first.v = _mm_movelh_ps(first.v, second.v);
    174     }
    175   }
    176 };
    177 
    178 template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
    179 {
    180   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
    181   { return padd(pmul(x,y),c); }
    182 
    183   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
    184   {
    185     #ifdef EIGEN_VECTORIZE_SSE3
    186     return internal::pmul(a, pconj(b));
    187     #else
    188     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    189     return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
    190                                 _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
    191                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
    192     #endif
    193   }
    194 };
    195 
    196 template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
    197 {
    198   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
    199   { return padd(pmul(x,y),c); }
    200 
    201   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
    202   {
    203     #ifdef EIGEN_VECTORIZE_SSE3
    204     return internal::pmul(pconj(a), b);
    205     #else
    206     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    207     return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
    208                                 _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
    209                                                       vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
    210     #endif
    211   }
    212 };
    213 
    214 template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
    215 {
    216   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
    217   { return padd(pmul(x,y),c); }
    218 
    219   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
    220   {
    221     #ifdef EIGEN_VECTORIZE_SSE3
    222     return pconj(internal::pmul(a, b));
    223     #else
    224     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    225     return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
    226                                 _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
    227                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
    228     #endif
    229   }
    230 };
    231 
    232 template<> struct conj_helper<Packet4f, Packet2cf, false,false>
    233 {
    234   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
    235   { return padd(c, pmul(x,y)); }
    236 
    237   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
    238   { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
    239 };
    240 
    241 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
    242 {
    243   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
    244   { return padd(c, pmul(x,y)); }
    245 
    246   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
    247   { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
    248 };
    249 
    250 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
    251 {
    252   // TODO optimize it for SSE3 and 4
    253   Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
    254   __m128 s = _mm_mul_ps(b.v,b.v);
    255   return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
    256 }
    257 
    258 EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
    259 {
    260   return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
    261 }
    262 
    263 
    264 //---------- double ----------
    265 struct Packet1cd
    266 {
    267   EIGEN_STRONG_INLINE Packet1cd() {}
    268   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
    269   __m128d  v;
    270 };
    271 
    272 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
    273 // to leverage AVX instructions.
    274 #ifndef EIGEN_VECTORIZE_AVX
    275 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
    276 {
    277   typedef Packet1cd type;
    278   typedef Packet1cd half;
    279   enum {
    280     Vectorizable = 1,
    281     AlignedOnScalar = 0,
    282     size = 1,
    283     HasHalfPacket = 0,
    284 
    285     HasAdd    = 1,
    286     HasSub    = 1,
    287     HasMul    = 1,
    288     HasDiv    = 1,
    289     HasNegate = 1,
    290     HasAbs    = 0,
    291     HasAbs2   = 0,
    292     HasMin    = 0,
    293     HasMax    = 0,
    294     HasSetLinear = 0
    295   };
    296 };
    297 #endif
    298 
    299 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
    300 
    301 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
    302 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
    303 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
    304 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
    305 {
    306   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    307   return Packet1cd(_mm_xor_pd(a.v,mask));
    308 }
    309 
    310 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
    311 {
    312   #ifdef EIGEN_VECTORIZE_SSE3
    313   return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
    314                                  _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
    315                                             vec2d_swizzle1(b.v, 1, 0))));
    316   #else
    317   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
    318   return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
    319                               _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
    320                                                     vec2d_swizzle1(b.v, 1, 0)), mask)));
    321   #endif
    322 }
    323 
    324 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
    325 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
    326 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
    327 template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
    328 
    329 // FIXME force unaligned load, this is a temporary fix
    330 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
    331 { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
    332 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
    333 { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
    334 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
    335 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
    336 
    337 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
    338 
    339 // FIXME force unaligned store, this is a temporary fix
    340 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
    341 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
    342 
    343 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
    344 
    345 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
    346 {
    347   EIGEN_ALIGN16 double res[2];
    348   _mm_store_pd(res, a.v);
    349   return std::complex<double>(res[0],res[1]);
    350 }
    351 
    352 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
    353 
    354 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
    355 {
    356   return pfirst(a);
    357 }
    358 
    359 template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
    360 {
    361   return vecs[0];
    362 }
    363 
    364 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
    365 {
    366   return pfirst(a);
    367 }
    368 
    369 template<int Offset>
    370 struct palign_impl<Offset,Packet1cd>
    371 {
    372   static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
    373   {
    374     // FIXME is it sure we never have to align a Packet1cd?
    375     // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
    376   }
    377 };
    378 
    379 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
    380 {
    381   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
    382   { return padd(pmul(x,y),c); }
    383 
    384   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
    385   {
    386     #ifdef EIGEN_VECTORIZE_SSE3
    387     return internal::pmul(a, pconj(b));
    388     #else
    389     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    390     return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
    391                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
    392                                            vec2d_swizzle1(b.v, 1, 0))));
    393     #endif
    394   }
    395 };
    396 
    397 template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
    398 {
    399   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
    400   { return padd(pmul(x,y),c); }
    401 
    402   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
    403   {
    404     #ifdef EIGEN_VECTORIZE_SSE3
    405     return internal::pmul(pconj(a), b);
    406     #else
    407     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    408     return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
    409                                 _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
    410                                                       vec2d_swizzle1(b.v, 1, 0)), mask)));
    411     #endif
    412   }
    413 };
    414 
    415 template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    416 {
    417   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
    418   { return padd(pmul(x,y),c); }
    419 
    420   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
    421   {
    422     #ifdef EIGEN_VECTORIZE_SSE3
    423     return pconj(internal::pmul(a, b));
    424     #else
    425     const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    426     return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
    427                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
    428                                            vec2d_swizzle1(b.v, 1, 0))));
    429     #endif
    430   }
    431 };
    432 
    433 template<> struct conj_helper<Packet2d, Packet1cd, false,false>
    434 {
    435   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
    436   { return padd(c, pmul(x,y)); }
    437 
    438   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
    439   { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
    440 };
    441 
    442 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
    443 {
    444   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
    445   { return padd(c, pmul(x,y)); }
    446 
    447   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
    448   { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
    449 };
    450 
    451 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
    452 {
    453   // TODO optimize it for SSE3 and 4
    454   Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
    455   __m128d s = _mm_mul_pd(b.v,b.v);
    456   return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
    457 }
    458 
    459 EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
    460 {
    461   return Packet1cd(preverse(Packet2d(x.v)));
    462 }
    463 
    464 EIGEN_DEVICE_FUNC inline void
    465 ptranspose(PacketBlock<Packet2cf,2>& kernel) {
    466   __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
    467   __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
    468 
    469   __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
    470   kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
    471   kernel.packet[1].v = tmp;
    472 }
    473 
    474 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
    475   __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
    476   return Packet2cf(_mm_castpd_ps(result));
    477 }
    478 
    479 template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
    480 {
    481   return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
    482 }
    483 
    484 template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
    485 {
    486   return pset1<Packet1cd>(b);
    487 }
    488 
    489 template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
    490 {
    491   return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
    492 }
    493 
    494 template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
    495 {
    496   return pset1<Packet1cd>(b);
    497 }
    498 
    499 } // end namespace internal
    500 
    501 } // end namespace Eigen
    502 
    503 #endif // EIGEN_COMPLEX_SSE_H
    504