/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>

#include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {

// Forward nearest-neighbor resize for NHWC tensors.
//
// Launched with one logical thread per output element (`nthreads` ==
// batch * out_height * out_width * channels); CUDA_1D_KERNEL_LOOP makes the
// kernel correct for any grid size. Each thread decomposes its flat output
// index into (n, out_y, out_x, c), maps the output coordinate back to the
// nearest source pixel, and copies that value. With align_corners the scaled
// coordinate is rounded to the nearest integer; otherwise it is floored.
// Both paths clamp to the last valid row/column.
template <typename T, bool align_corners>
__global__ void ResizeNearestNeighborNHWC(
    const int nthreads, const T* bottom_data, const int in_height,
    const int in_width, const int channels, const int out_height,
    const int out_width, const float height_scale, const float width_scale,
    T* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel the flat index apart in NHWC order: channels vary fastest.
    int n = index;
    int c = n % channels;
    n /= channels;
    int out_x = n % out_width;
    n /= out_width;
    int out_y = n % out_height;
    n /= out_height;  // n is now the batch index.

    // Base of this batch item's input image.
    const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
    // Nearest source row/column; clamped so scale rounding can't run past
    // the last valid pixel.
    const int in_y =
        min((align_corners) ? static_cast<int>(roundf(out_y * height_scale))
                            : static_cast<int>(floorf(out_y * height_scale)),
            in_height - 1);
    const int in_x =
        min((align_corners) ? static_cast<int>(roundf(out_x * width_scale))
                            : static_cast<int>(floorf(out_x * width_scale)),
            in_width - 1);
    const int idx = (in_y * in_width + in_x) * channels + c;
    // ldg: read-only cached load of the source pixel.
    top_data[index] = ldg(bottom_data_n + idx);
  }
}

// Backward pass of the nearest-neighbor resize for NHWC tensors.
//
// Naming note: here `in_height`/`in_width` are the dimensions of the incoming
// gradient `top_diff` (i.e. the forward op's *output* shape), while
// `out_height`/`out_width` describe `bottom_diff` (the forward op's *input*
// shape). Launched with one logical thread per element of `top_diff`; each
// thread recomputes the forward mapping for its coordinate and scatter-adds
// its gradient into the corresponding `bottom_diff` cell. The add must be
// atomic because several top_diff elements can map to the same source pixel.
// `bottom_diff` is expected to be zero-initialized by the caller.
template <typename T, bool align_corners>
__global__ void ResizeNearestNeighborBackwardNHWC(
    const int nthreads, const T* top_diff, const int in_height,
    const int in_width, const int channels, const int out_height,
    const int out_width, const float height_scale, const float width_scale,
    T* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat top_diff index in NHWC order.
    int n = index;
    int c = n % channels;
    n /= channels;
    int in_x = n % in_width;
    n /= in_width;
    int in_y = n % in_height;
    n /= in_height;  // n is now the batch index.

    T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
    // Same round/floor + clamp mapping as the forward kernel.
    const int out_y =
        min((align_corners) ? static_cast<int>(roundf(in_y * height_scale))
                            : static_cast<int>(floorf(in_y * height_scale)),
            out_height - 1);
    const int out_x =
        min((align_corners) ? static_cast<int>(roundf(in_x * width_scale))
                            : static_cast<int>(floorf(in_x * width_scale)),
            out_width - 1);
    const int idx = (out_y * out_width + out_x) * channels + c;
    // Atomic: multiple threads may target the same bottom_diff cell.
    CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));
  }
}

}  // namespace

namespace functor {

// Partial specialization of ResizeNearestNeighbor functor for a GPUDevice.
// Launches the forward nearest-neighbor resize kernel: one logical thread
// per element of `output`. Returns true iff the device reports no error
// after the launch (also true trivially for an empty output).
template <typename T, bool align_corners>
struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  const float height_scale, const float width_scale,
                  typename TTypes<T, 4>::Tensor output) {
    // Input geometry (NHWC).
    const int batch_size = input.dimension(0);
    const int channels = input.dimension(3);
    const int64 in_height = input.dimension(1);
    const int64 in_width = input.dimension(2);
    // Target geometry comes from the preallocated output tensor.
    const int64 out_height = output.dimension(1);
    const int64 out_width = output.dimension(2);

    // Empty output: nothing to launch, report success immediately.
    const int num_outputs = batch_size * out_height * out_width * channels;
    if (num_outputs == 0) return true;

    CudaLaunchConfig cfg = GetCudaLaunchConfig(num_outputs, d);
    ResizeNearestNeighborNHWC<T, align_corners>
        <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
            num_outputs, input.data(), in_height, in_width, channels,
            out_height, out_width, height_scale, width_scale, output.data());
    return d.ok();
  }
};

#define DECLARE_GPU_SPEC(T)                                    \
  template struct ResizeNearestNeighbor<GPUDevice, T, false>;  \
  template struct ResizeNearestNeighbor<GPUDevice, T, true>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);

#undef DECLARE_GPU_SPEC

// Partial specialization of ResizeNearestNeighborGrad functor for a GPUDevice.
// Computes the gradient of the nearest-neighbor resize on the GPU.
//
// `input` is the incoming gradient (shaped like the forward op's output);
// `output` receives the gradient w.r.t. the forward op's input. The output
// buffer is zero-filled here first, then the backward kernel scatter-adds
// one contribution per incoming-gradient element. Returns true iff every
// launch succeeded (trivially true when the output is empty).
template <typename T, bool align_corners>
struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  const float height_scale, const float width_scale,
                  typename TTypes<T, 4>::Tensor output) {
    const int batch_size = input.dimension(0);
    const int64 in_height = input.dimension(1);
    const int64 in_width = input.dimension(2);
    const int channels = input.dimension(3);

    const int64 out_height = output.dimension(1);
    const int64 out_width = output.dimension(2);

    const int output_size = batch_size * channels * out_height * out_width;
    // Mirror the forward functor's empty guard: with no output elements there
    // is nothing to zero or accumulate, and GetCudaLaunchConfig must not be
    // called with a zero element count.
    if (output_size == 0) return true;

    // Zero the gradient buffer before accumulating into it.
    CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
    SetZero<<<output_config.block_count, output_config.thread_per_block, 0,
              d.stream()>>>(output_size, output.data());
    if (!d.ok()) return false;

    // Empty incoming gradient: the zeroed output is already the answer.
    const int input_size = batch_size * channels * in_height * in_width;
    if (input_size == 0) return true;

    CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
    ResizeNearestNeighborBackwardNHWC<T, align_corners>
        <<<input_config.block_count, input_config.thread_per_block, 0,
           d.stream()>>>(input_config.virtual_thread_count, input.data(),
                         in_height, in_width, channels, out_height, out_width,
                         height_scale, width_scale, output.data());
    return d.ok();
  }
};

#define DECLARE_GPU_SPEC(T)                                        \
  template struct ResizeNearestNeighborGrad<GPUDevice, T, false>;  \
  template struct ResizeNearestNeighborGrad<GPUDevice, T, true>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);

#undef DECLARE_GPU_SPEC

}  // namespace functor

}  // namespace tensorflow

#endif  // GOOGLE_CUDA