// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now.

// Reducer function that enables multiple cuda threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable and
// attempts to update it with the new value. If in the meantime another cuda thread
// updated the content of the output address, it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
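
// Illustrative usage sketch (not part of the implementation): a caller typically invokes
// atomicReduce() once per warp with a partially reduced value, e.g. with a SumReducer.
// The CAS loop above only commits the update if no other thread modified *output in the
// meantime; otherwise it re-applies the reduction to the freshly read value and retries:
//
//   SumReducer<float> reducer;
//   float partial = ...;                   // value accumulated by this thread/warp
//   atomicReduce(output, partial, reducer); // atomically performs *output += partial
//
// (For float sums the dedicated specialization below, which simply calls atomicAdd, is
// selected instead of this generic CAS loop.)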

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
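
// Illustrative note (not part of the implementation): the bit-reinterpretation trick used
// above is the same one the CUDA programming guide recommends for emulating 64-bit atomics
// that the hardware lacks. For instance, an atomic add on doubles can be sketched as
// (hypothetical helper, shown only to illustrate the pattern the generic atomicReduce()
// applies to 8-byte types through the reducer):
//
//   __device__ double atomicAddDouble(double* address, double val) {
//     unsigned long long int* address_as_ull = (unsigned long long int*)address;
//     unsigned long long int old = *address_as_ull, assumed;
//     do {
//       assumed = old;
//       old = atomicCAS(address_as_ull, assumed,
//                       __double_as_longlong(val + __longlong_as_double(assumed)));
//     } while (assumed != old);
//     return __longlong_as_double(old);
//   }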

#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}


template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
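
// Illustrative note (not part of the implementation): this is a standard grid-stride loop.
// With num_threads = blockDim.x * gridDim.x total threads, thread t writes output[t],
// output[t + num_threads], output[t + 2 * num_threads], ... so the kernel covers any
// num_preserved_coeffs regardless of the launch configuration. For example, with 2 blocks
// of 4 threads (num_threads = 8) and num_preserved_coeffs = 10, thread 1 initializes
// output[1] and output[9].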


template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        }
        while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i+=BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
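
// Illustrative sketch (not part of the implementation): the #pragma unroll loop above is
// the usual warp-level tree reduction. __shfl_down(accum, offset) lets lane i read the
// accumulator of lane i + offset, so successive offsets 16, 8, 4, 2, 1 fold the 32 partial
// values of a warp into lane 0 in log2(32) = 5 steps. For a plain sum it is equivalent to:
//
//   __device__ float warp_sum(float v) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2)
//       v += __shfl_down(v, offset, warpSize);
//     return v;  // only valid in lane 0 of the warp
//   }
//
// which is why only the first lane of each warp, (threadIdx.x & (warpSize - 1)) == 0,
// performs the atomicReduce into the output.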


#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  if (num_coeffs % 2 != 0) {
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                    half* output, half2* scratch) {
  eigen_assert(NumPerThread % 2 == 0);

  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
    __syncthreads();
  }

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    eigen_assert(index + 1 < num_coeffs);
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}
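
// Illustrative note (not part of the implementation): the half-float path reduces two
// coefficients at a time inside a half2 packet, so the final scratch value still holds two
// partial results. Folding them into a single half is done as above, i.e. roughly:
//
//   template <typename R>
//   __device__ half fold_half2(const half2& packet, R& reducer) {
//     half tmp = __low2half(packet);               // first partial result
//     reducer.reduce(__high2half(packet), &tmp);   // combine it with the second one
//     return tmp;
//   }
//
// The odd trailing coefficient (when num_coeffs is odd) was already folded into the
// scratch value by the initialization code, paired with reducer.initialize().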

template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  // This kernel is launched with a single thread.
  eigen_assert(threadIdx.x == 0);
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}

#endif

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
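
// Illustrative note (not part of the implementation): each block processes
// block_size * num_per_thread = 256 * 128 = 32768 coefficients. For example, a tensor with
// 2^20 = 1048576 coefficients is reduced with divup(1048576, 32768) = 32 blocks, whose
// partial results are then combined through atomicReduce() under the control of the
// semaphore.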

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure that there won't be race conditions between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif


template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so we restrict the optimized version of the code to the simple cases
  // of doubles, floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
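
// Illustrative usage sketch (not part of the implementation, assuming the public Tensor
// API and a CUDA stream device): a full reduction evaluated on a GpuDevice is dispatched
// to this FullReducer, e.g.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 1024);  // device pointer
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);           // device pointer
//   out.device(gpu_device) = in.sum();   // ends up in FullReductionKernel
//
// Types other than float, double and (optionally) half take the generic, non-optimized
// reduction path since HasOptimizedImplementation is false for them.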


template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
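
// Illustrative note (not part of the implementation): the input is treated as a
// num_preserved_coeffs x num_coeffs_to_reduce matrix stored row by row, and each row is
// split into input_col_blocks column blocks of blockDim.x * NumPerThread coefficients.
// Block index i then maps to
//
//   row       = i / input_col_blocks   // which output coefficient this block contributes to
//   col_block = i % input_col_blocks   // which slice of that row this block reduces
//
// For example, with num_coeffs_to_reduce = 100000, blockDim.x = 256 and NumPerThread = 128,
// input_col_blocks = divup(100000, 32768) = 4, so blocks 0..3 all reduce parts of row 0,
// blocks 4..7 parts of row 1, and so on, with their partial results merged via atomicReduce.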

#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel the remaining odd coefficient of each row.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
  Self, Op, OutputType, PacketAccess,
  typename internal::enable_if<
    internal::is_same<float, OutputType>::value ||
    internal::is_same<double, OutputType>::value,
  void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/128;
    const int num_per_thread = /*128*/64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif


template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so we restrict the optimized version of the code to the simple cases
  // of floats, doubles and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path when there are few coefficients to reduce.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
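
// Illustrative usage sketch (not part of the implementation, assuming the public Tensor
// API, the default ColMajor layout and a CUDA stream device): reducing along the innermost
// (contiguous) dimension is dispatched here, e.g.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 4096, 256);  // device pointers
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 256);
//   Eigen::array<int, 1> dims;
//   dims[0] = 0;                         // reduce the inner dimension of size 4096
//   out.device(gpu_device) = in.sum(dims);
//
// Here num_coeffs_to_reduce = 4096 and num_preserved_vals = 256. Returning true from run()
// tells the caller to fall back to the generic (non GPU-optimized) code path.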

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
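
// Illustrative note (not part of the implementation): here the input is traversed as a
// num_coeffs_to_reduce x num_preserved_coeffs matrix stored row by row, so consecutive
// threads touch consecutive columns (coalesced accesses). Work item i maps to
//
//   input_col = i % num_preserved_coeffs                   // which output to accumulate into
//   input_row = (i / num_preserved_coeffs) * NumPerThread  // first of up to NumPerThread rows
//
// For example, with num_preserved_coeffs = 1000 and NumPerThread = 16, work item 2500
// reduces rows 32..47 of column 500 and merges the result into output[500] with
// atomicReduce.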


template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so we restrict the optimized version of the code to the simple cases
  // of floats and doubles.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path when there are few coefficients to reduce.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H