// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (at) gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple CUDA threads to safely accumulate at the same
// output address. It reads the current value of the output variable, and attempts to
// update it with the newly reduced value. If another CUDA thread updated the content of
// the output address in the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4)
  {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}

#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif
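
// Illustrative sketch (not part of Eigen): the compare-and-swap retry loop used by
// atomicReduce above, written out for a plain float "max" so the pattern is easier
// to follow. The helper name atomicMaxFloat is hypothetical.
//
//   __device__ inline void atomicMaxFloat(float* output, float value) {
//     unsigned int* addr = reinterpret_cast<unsigned int*>(output);
//     unsigned int oldval = *addr;
//     for (;;) {
//       const float combined = fmaxf(__uint_as_float(oldval), value);
//       const unsigned int newval = __float_as_uint(combined);
//       if (newval == oldval) return;               // output already up to date
//       const unsigned int readback = atomicCAS(addr, oldval, newval);
//       if (readback == oldval) return;             // our update won the race
//       oldval = readback;                          // lost the race: retry
//     }
//   }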

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}


template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}


template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        }
        while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
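
// Illustrative sketch (not part of Eigen): the warp-level tree reduction used a few
// lines above, written out for a plain float sum. Each __shfl_down step halves the
// number of live partial results, so after log2(warpSize) steps lane 0 of the warp
// holds the sum of all 32 lanes. The helper name warpSumFloat is hypothetical.
//
//   __device__ inline float warpSumFloat(float value) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//       value += __shfl_down(value, offset, warpSize);
//     }
//     return value;  // only lane 0 holds the full sum
//   }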

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  if (num_coeffs % 2 != 0) {
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                             half* output, half2* scratch) {
  eigen_assert(NumPerThread % 2 == 0);

  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
    __syncthreads();
  }

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    eigen_assert(index + 1 < num_coeffs);
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}

template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  // This kernel is launched with a single thread.
  eigen_assert(threadIdx.x == 0);
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}

#endif

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
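
// Host-side usage sketch (illustrative): the launcher above is not called directly;
// it is reached when a full reduction expression is assigned on a GpuDevice. A
// minimal sketch, assuming the unsupported Tensor module and a CUDA-capable device
// are available:
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   Eigen::Tensor<float, 2> in(1024, 1024);
//   in.setRandom();
//   float* gpu_in = static_cast<float*>(gpu_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(gpu_device.allocate(sizeof(float)));
//   gpu_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in_map(gpu_in, 1024, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out_map(gpu_out);
//   out_map.device(gpu_device) = in_map.sum();  // dispatches to FullReducer below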

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be
      // sure that there won't be races between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif
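
// Worked example (illustrative): with the launch parameters used by the launchers
// above, one block covers block_size * num_per_thread = 256 * 128 = 32768
// coefficients, so
//
//   num_blocks = divup(num_coeffs, 32768)
//              = divup(1000000, 32768) = 31   for a 1M-element reduction,
//              = 1                            for anything up to 32768 elements,
//
// and the semaphore / init-kernel machinery is only needed in the multi-block case,
// since a single block can initialize the output itself.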

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well, so restrict
  // the optimized version of the code to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
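
// Reference computation (illustrative): InnerReductionKernel below reduces the
// innermost, contiguous dimension, addressing the input as a
// num_preserved_coeffs x num_coeffs_to_reduce row-major array. On the host the same
// result would be obtained with the following scalar loop (shown for a sum over a
// float buffer; "input" and "output" are hypothetical raw pointers):
//
//   for (Index row = 0; row < num_preserved_coeffs; ++row) {
//     float acc = 0.0f;  // reducer.initialize() for a SumReducer
//     for (Index col = 0; col < num_coeffs_to_reduce; ++col) {
//       acc += input[row * num_coeffs_to_reduce + col];
//     }
//     output[row] = acc;
//   }
//
// The kernel parallelizes the inner loop across a block and folds the partial
// results with warp shuffles and atomicReduce.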

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
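
// Illustrative sketch (not part of Eigen): the half-precision kernel below keeps its
// accumulators packed as half2 and only collapses them to scalars at the end, using
// the same __low2half/__high2half combine shown here for a sum. The helper name
// half2SumToScalar is hypothetical and assumes a device with native fp16 arithmetic
// (__CUDA_ARCH__ >= 530).
//
//   __device__ inline half half2SumToScalar(half2 packed) {
//     return __hadd(__low2half(packed), __high2half(packed));
//   }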

#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel the last, odd column.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif
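
// Note on initialization (illustrative): the inner-reduction kernels above only
// initialize the output buffer themselves when gridDim.x == 1. When the launchers
// below use more than one block, they therefore issue two launches: first
// ReductionInitKernel fills the output with reducer.initialize(), then the reduction
// kernel accumulates into it with atomicReduce. Sketch of the sequence, with
// hypothetical block counts:
//
//   // 1. output[i] = reducer.initialize() for all num_preserved_vals entries
//   LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
//                      init_blocks, 1024, 0, device,
//                      reducer.initialize(), num_preserved_vals, output);
//   // 2. every block accumulates its partial results into output[]
//   LAUNCH_CUDA_KERNEL((InnerReductionKernel<128, Self, Op, Index>),
//                      num_blocks, 256, 0, device,
//                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);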

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be races between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/128;
    const int num_per_thread = /*128*/64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be races between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif
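
// Worked example (illustrative): the launchers above cap the block count by the
// number of threads the device can keep resident, e.g. on a GPU with 16
// multiprocessors and 2048 threads per multiprocessor:
//
//   max_blocks = 16 * 2048 / 256 = 128
//   num_blocks = min(max_blocks, divup(num_coeffs, 256 * 128))
//
// so a 10M-coefficient reduction requests divup(10000000, 32768) = 306 blocks but
// launches only 128, and each block loops over several input chunks inside the
// kernel.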

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well, so restrict
  // the optimized version of the code to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
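
// Reference computation (illustrative): OuterReductionKernel above reduces the
// outermost dimension, so for a sum the value that ends up in output[col] is the one
// produced by this scalar loop ("input" and "output" are hypothetical raw pointers):
//
//   for (Index col = 0; col < num_preserved_coeffs; ++col) {
//     float acc = 0.0f;  // reducer.initialize() for a SumReducer
//     for (Index row = 0; row < num_coeffs_to_reduce; ++row) {
//       acc += input[row * num_preserved_coeffs + col];
//     }
//     output[col] = acc;
//   }
//
// Consecutive threads handle consecutive values of col, so the loads of
// input[row * num_preserved_coeffs + col] are coalesced across the warp.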

template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well, so restrict
  // the optimized version of the code to the simple cases of floats and doubles.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be races between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H