Home | History | Annotate | Download | only in Tensor

Lines Matching refs:num_coeffs

125 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
161 Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
164 eigen_assert(index < num_coeffs);
191 __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
194 if (num_coeffs % 2 != 0) {
195 half last = input.m_impl.coeff(num_coeffs-1);
204 __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
207 const Index num_packets = num_coeffs / 2;
212 if (thread_id == 0 && num_coeffs % 2 != 0) {
213 output[num_coeffs-1] = reducer.initialize();
219 __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
227 if (num_coeffs % 2 != 0) {
228 half last = input.m_impl.coeff(num_coeffs-1);
237 const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
240 eigen_assert(index + 1 < num_coeffs);
288 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
293 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
301 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
315 static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
320 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
327 1, 1, 0, device, reducer, self, num_coeffs, scratch);
331 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
361 const Index num_coeffs = array_prod(self.m_impl.dimensions());
363 if (num_coeffs == 0) {
367 FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
558 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
561 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
604 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
607 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
651 const Index num_coeffs = array_prod(self.m_impl.dimensions());
653 if (num_coeffs == 0) {
717 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
720 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);