Home | History | Annotate | Download | only in CUDA
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (at) gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_PACKET_MATH_CUDA_H
     11 #define EIGEN_PACKET_MATH_CUDA_H
     12 
     13 namespace Eigen {
     14 
     15 namespace internal {
     16 
     17 // Make sure this is only available when targeting a GPU: we don't want to
     18 // introduce conflicts between these packet_traits definitions and the ones
     19 // we'll use on the host side (SSE, AVX, ...)
     20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
     21 template<> struct is_arithmetic<float4>  { enum { value = true }; };
     22 template<> struct is_arithmetic<double2> { enum { value = true }; };
     23 
     24 template<> struct packet_traits<float> : default_packet_traits
     25 {
     26   typedef float4 type;
     27   typedef float4 half;
     28   enum {
     29     Vectorizable = 1,
     30     AlignedOnScalar = 1,
     31     size=4,
     32     HasHalfPacket = 0,
     33 
     34     HasDiv  = 1,
     35     HasSin  = 0,
     36     HasCos  = 0,
     37     HasLog  = 1,
     38     HasExp  = 1,
     39     HasSqrt = 1,
     40     HasRsqrt = 1,
     41     HasLGamma = 1,
     42     HasDiGamma = 1,
     43     HasZeta = 1,
     44     HasPolygamma = 1,
     45     HasErf = 1,
     46     HasErfc = 1,
     47     HasIGamma = 1,
     48     HasIGammac = 1,
     49     HasBetaInc = 1,
     50 
     51     HasBlend = 0,
     52   };
     53 };
     54 
     55 template<> struct packet_traits<double> : default_packet_traits
     56 {
     57   typedef double2 type;
     58   typedef double2 half;
     59   enum {
     60     Vectorizable = 1,
     61     AlignedOnScalar = 1,
     62     size=2,
     63     HasHalfPacket = 0,
     64 
     65     HasDiv  = 1,
     66     HasLog  = 1,
     67     HasExp  = 1,
     68     HasSqrt = 1,
     69     HasRsqrt = 1,
     70     HasLGamma = 1,
     71     HasDiGamma = 1,
     72     HasZeta = 1,
     73     HasPolygamma = 1,
     74     HasErf = 1,
     75     HasErfc = 1,
     76     HasIGamma = 1,
     77     HasIGammac = 1,
     78     HasBetaInc = 1,
     79 
     80     HasBlend = 0,
     81   };
     82 };
     83 
     84 
     85 template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
     86 template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
     87 
     88 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
     89   return make_float4(from, from, from, from);
     90 }
     91 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
     92   return make_double2(from, from);
     93 }
     94 
     95 
     96 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
     97   return make_float4(a, a+1, a+2, a+3);
     98 }
     99 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
    100   return make_double2(a, a+1);
    101 }
    102 
    103 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
    104   return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
    105 }
    106 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
    107   return make_double2(a.x+b.x, a.y+b.y);
    108 }
    109 
    110 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
    111   return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
    112 }
    113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
    114   return make_double2(a.x-b.x, a.y-b.y);
    115 }
    116 
    117 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
    118   return make_float4(-a.x, -a.y, -a.z, -a.w);
    119 }
    120 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
    121   return make_double2(-a.x, -a.y);
    122 }
    123 
    124 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
    125 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
    126 
    127 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
    128   return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
    129 }
    130 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
    131   return make_double2(a.x*b.x, a.y*b.y);
    132 }
    133 
    134 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
    135   return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
    136 }
    137 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
    138   return make_double2(a.x/b.x, a.y/b.y);
    139 }
    140 
    141 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
    142   return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
    143 }
    144 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
    145   return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
    146 }
    147 
    148 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
    149   return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
    150 }
    151 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
    152   return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
    153 }
    154 
    155 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
    156   return *reinterpret_cast<const float4*>(from);
    157 }
    158 
    159 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
    160   return *reinterpret_cast<const double2*>(from);
    161 }
    162 
    163 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
    164   return make_float4(from[0], from[1], from[2], from[3]);
    165 }
    166 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
    167   return make_double2(from[0], from[1]);
    168 }
    169 
    170 template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
    171   return make_float4(from[0], from[0], from[1], from[1]);
    172 }
    173 template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
    174   return make_double2(from[0], from[0]);
    175 }
    176 
    177 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
    178   *reinterpret_cast<float4*>(to) = from;
    179 }
    180 
    181 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
    182   *reinterpret_cast<double2*>(to) = from;
    183 }
    184 
    185 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
    186   to[0] = from.x;
    187   to[1] = from.y;
    188   to[2] = from.z;
    189   to[3] = from.w;
    190 }
    191 
    192 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
    193   to[0] = from.x;
    194   to[1] = from.y;
    195 }
    196 
    197 template<>
    198 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
    199 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
    200   return __ldg((const float4*)from);
    201 #else
    202   return make_float4(from[0], from[1], from[2], from[3]);
    203 #endif
    204 }
    205 template<>
    206 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
    207 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
    208   return __ldg((const double2*)from);
    209 #else
    210   return make_double2(from[0], from[1]);
    211 #endif
    212 }
    213 
    214 template<>
    215 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
    216 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
    217   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
    218 #else
    219   return make_float4(from[0], from[1], from[2], from[3]);
    220 #endif
    221 }
    222 template<>
    223 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
    224 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
    225   return make_double2(__ldg(from+0), __ldg(from+1));
    226 #else
    227   return make_double2(from[0], from[1]);
    228 #endif
    229 }
    230 
    231 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
    232   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
    233 }
    234 
    235 template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
    236   return make_double2(from[0*stride], from[1*stride]);
    237 }
    238 
    239 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
    240   to[stride*0] = from.x;
    241   to[stride*1] = from.y;
    242   to[stride*2] = from.z;
    243   to[stride*3] = from.w;
    244 }
    245 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
    246   to[stride*0] = from.x;
    247   to[stride*1] = from.y;
    248 }
    249 
    250 template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
    251   return a.x;
    252 }
    253 template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
    254   return a.x;
    255 }
    256 
    257 template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
    258   return a.x + a.y + a.z + a.w;
    259 }
    260 template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
    261   return a.x + a.y;
    262 }
    263 
    264 template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
    265   return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
    266 }
    267 template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
    268   return fmax(a.x, a.y);
    269 }
    270 
    271 template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
    272   return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
    273 }
    274 template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
    275   return fmin(a.x, a.y);
    276 }
    277 
    278 template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
    279   return a.x * a.y * a.z * a.w;
    280 }
    281 template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
    282   return a.x * a.y;
    283 }
    284 
    285 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
    286   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
    287 }
    288 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
    289   return make_double2(fabs(a.x), fabs(a.y));
    290 }
    291 
    292 EIGEN_DEVICE_FUNC inline void
    293 ptranspose(PacketBlock<float4,4>& kernel) {
    294   float tmp = kernel.packet[0].y;
    295   kernel.packet[0].y = kernel.packet[1].x;
    296   kernel.packet[1].x = tmp;
    297 
    298   tmp = kernel.packet[0].z;
    299   kernel.packet[0].z = kernel.packet[2].x;
    300   kernel.packet[2].x = tmp;
    301 
    302   tmp = kernel.packet[0].w;
    303   kernel.packet[0].w = kernel.packet[3].x;
    304   kernel.packet[3].x = tmp;
    305 
    306   tmp = kernel.packet[1].z;
    307   kernel.packet[1].z = kernel.packet[2].y;
    308   kernel.packet[2].y = tmp;
    309 
    310   tmp = kernel.packet[1].w;
    311   kernel.packet[1].w = kernel.packet[3].y;
    312   kernel.packet[3].y = tmp;
    313 
    314   tmp = kernel.packet[2].w;
    315   kernel.packet[2].w = kernel.packet[3].z;
    316   kernel.packet[3].z = tmp;
    317 }
    318 
    319 EIGEN_DEVICE_FUNC inline void
    320 ptranspose(PacketBlock<double2,2>& kernel) {
    321   double tmp = kernel.packet[0].y;
    322   kernel.packet[0].y = kernel.packet[1].x;
    323   kernel.packet[1].x = tmp;
    324 }
    325 
    326 #endif
    327 
    328 } // end namespace internal
    329 
    330 } // end namespace Eigen
    331 
    332 
    333 #endif // EIGEN_PACKET_MATH_CUDA_H
    334