Cross Reference: /external/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h

Lines Matching refs:num_coeffs
125 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
161   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
164     eigen_assert(index < num_coeffs);
191 __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
194   if (num_coeffs % 2 != 0) {
195     half last = input.m_impl.coeff(num_coeffs-1);
204 __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
207   const Index num_packets = num_coeffs / 2;
212   if (thread_id == 0 && num_coeffs % 2 != 0) {
213     output[num_coeffs-1] = reducer.initialize();
219 __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
227     if (num_coeffs % 2 != 0) {
228       half last = input.m_impl.coeff(num_coeffs-1);
237   const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
240     eigen_assert(index + 1 < num_coeffs);
288   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
293     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
301                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
315   static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
320     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
327                          1, 1, 0, device, reducer, self, num_coeffs, scratch);
331                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
361     const Index num_coeffs = array_prod(self.m_impl.dimensions());
363     if (num_coeffs == 0) {
367     FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
558     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
561     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
604     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
607     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
651     const Index num_coeffs = array_prod(self.m_impl.dimensions());
653     if (num_coeffs == 0) {
717     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
720     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
OpenGrok