/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/depthtospace_op.h"

#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace {

using GPUDevice = Eigen::GpuDevice;

// Depth2Space kernel for FORMAT_NHWC.
// See 'depthtospace_op.h' for a more detailed description.
template <typename dtype>
__global__ void D2S_NHWC(const int32 nthreads,
                         const dtype* __restrict__ input_ptr,
                         const int block_size, const int batch_size,
                         const int input_height, const int input_width,
                         const int input_depth, const int output_height,
                         const int output_width, const int output_depth,
                         dtype* __restrict__ output_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + output_depth * (w + output_width * (h + output_height * b))
    const int d = out_idx % output_depth;
    const int out_idx2 = out_idx / output_depth;
    const int w = out_idx2 % output_width;
    const int out_idx3 = out_idx2 / output_width;
    const int h = out_idx3 % output_height;
    const int b = out_idx3 / output_height;

    const int in_h = h / block_size;
    const int offset_h = h % block_size;
    const int in_w = w / block_size;
    const int offset_w = w % block_size;
    const int offset_d = (offset_h * block_size + offset_w) * output_depth;
    const int in_d = d + offset_d;
    const int inp_idx =
        in_d + input_depth * (in_w + input_width * (in_h + input_height * b));
    *(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
  }
}

// Depth2Space kernel for FORMAT_NCHW.
// See 'depthtospace_op.h' for a more detailed description.
template <typename dtype>
__global__ void D2S_NCHW(const int32 nthreads,
                         const dtype* __restrict__ input_ptr,
                         const int block_size, const int input_width,
                         const int output_depth_by_input_height,
                         dtype* __restrict__ output_ptr) {
  CUDA_1D_KERNEL_LOOP(input_idx, nthreads) {
    // We will be converting the image from ordering:
    // n, bY, bX, oC, iY, iX   (== input_idx)   to
    // n, oC, iY, bY, iX, bX

    // Start reading the input data straight away since we know the address.
    // We calculate the output address in parallel while this is being fetched.
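    // Worked example of the decomposition below (illustrative values, not
    // from the original source): with block_size = 2, output_depth = 1, and
    // input_height = input_width = 2 (so output_depth_by_input_height = 2),
    // input_idx = 13 unpacks to (n, bY, bX, oC, iY, iX) = (0, 1, 1, 0, 0, 1)
    // and maps to output_idx = 1 + 2 * (1 + 2 * (1 + 2 * 0)) = 7.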

    const int n_bY_bX_oC_iY = input_idx / input_width;
    const int iX = input_idx - n_bY_bX_oC_iY * input_width;

    const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height;
    const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height;

    const int n_bY = n_bY_bX / block_size;
    const int bX = n_bY_bX - n_bY * block_size;

    const int n = n_bY / block_size;
    const int bY = n_bY - n * block_size;

    const int output_idx =
        bX +
        block_size *
            (iX + input_width *
                      (bY + block_size *
                                (oC_iY + n * output_depth_by_input_height)));

    *(output_ptr + output_idx) = ldg(input_ptr + input_idx);
  }
}

template <typename dtype, int block_size>
__global__ void D2S_NCHW_LOOP(const int32 nthreads,
                              const dtype* __restrict__ input,
                              const int input_width, const int output_width,
                              const int output_depth_by_input_area,
                              const int input_depth_by_input_area,
                              dtype* __restrict__ output) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    // We will be converting the image from ordering:
    // n, bY, bX, oC, iY, iX to
    // n, oC, iY, bY, iX, bX

    // We assume thread_idx encodes n_oC_iY_iX, and use an unrolled loop over
    // the bY and bX coordinates within the block. This kernel is significantly
    // more performant than the D2S_NCHW kernel.
    // A likely explanation of the improvement is that although both kernels
    // get coalesced input reads, this one writes the output data more densely
    // per warp, so it benefits when delayed cache writeback is used.

    const int n_oC_iY = thread_idx / input_width;
    const int iX = thread_idx - n_oC_iY * input_width;

    const int n = thread_idx / output_depth_by_input_area;
    const int oC_iY_iX = thread_idx - n * output_depth_by_input_area;

    // Recombine the components and apply to the input and output pointers.
    auto input_ptr = input + n * input_depth_by_input_area + oC_iY_iX;
    auto output_ptr = output + (n_oC_iY * output_width + iX) * block_size;

    // Copy a block_size x block_size patch of data to the output image.
#pragma unroll
    for (int bY = 0; bY < block_size; ++bY) {
#pragma unroll
      for (int bX = 0; bX < block_size; ++bX) {
        output_ptr[bY * output_width + bX] = ldg(
            input_ptr + (bY * block_size + bX) * output_depth_by_input_area);
      }
    }
  }
}

}  // namespace
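
// Shape reference for the functors below (an illustrative note, not part of
// the original source): for FORMAT_NCHW with block_size = 2, an input of
// shape [N, 12, H, W] yields an output of shape [N, 3, 2*H, 2*W]; the input
// depth must be divisible by block_size * block_size.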
// Specialization of DepthToSpaceOpFunctor for a GPUDevice.
namespace functor {

template <typename T>
struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NHWC> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = output.dimension(0);
    const int input_height = input.dimension(1);
    const int input_width = input.dimension(2);
    const int input_depth = input.dimension(3);
    const int output_height = output.dimension(1);
    const int output_width = output.dimension(2);
    const int output_depth = output.dimension(3);

    const int total_count =
        batch_size * output_height * output_width * output_depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
    D2S_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, input.data(), block_size, batch_size,
        input_height, input_width, input_depth, output_height, output_width,
        output_depth, output.data());
  }
  void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                  int block_size, typename TTypes<T, 5>::Tensor output) {
    LOG(FATAL) << "5-D tensors should not be used with NHWC format";
  }
};

template <typename T>
struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = input.dimension(0);
    const int input_depth = input.dimension(1);
    const int input_height = input.dimension(2);
    const int input_width = input.dimension(3);
    const int output_depth = output.dimension(1);
    const int input_area = input_width * input_height;
    const int input_depth_by_input_area = input_depth * input_area;

    // We improve performance by generating instantiations of the loop kernel
    // for the most common block sizes.
    if (block_size <= 4) {
      const int output_width = output.dimension(3);
      const int output_depth_by_input_area = output_depth * input_area;
      const int total_count = batch_size * output_depth_by_input_area;
      CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
      switch (block_size) {
        case 2:
          return D2S_NCHW_LOOP<T, 2>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
        case 3:
          return D2S_NCHW_LOOP<T, 3>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
        case 4:
          return D2S_NCHW_LOOP<T, 4>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
      }
    }

    // Other block sizes are processed by the generic kernel.
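    // (Illustrative note, not in the original source: the specialized loop
    // kernels above use one thread per output-channel pixel, i.e.
    // batch_size * output_depth * input_area threads, each copying
    // block_size * block_size elements; the generic kernel below uses one
    // thread per input element, i.e. batch_size * input_depth * input_area.)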
    const int total_count = batch_size * input_depth_by_input_area;
    auto config = GetCudaLaunchConfig(total_count, d);
    D2S_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, input.data(), block_size, input_width,
        output_depth * input_height, output.data());
  }
  void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                  int block_size, typename TTypes<T, 5>::Tensor output) {
    LOG(FATAL) << "5-D tensors should not be used with NCHW format";
  }
};
}  // end namespace functor

// Instantiate the GPU implementations for float.
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NCHW>;
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NHWC>;

// NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
template struct functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW>;

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA