Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #if GOOGLE_CUDA
     17 
     18 #define EIGEN_USE_GPU
     19 
     20 #include <stdio.h>
     21 
     22 #include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
     23 
     24 #include "tensorflow/core/framework/register_types.h"
     25 #include "tensorflow/core/framework/tensor_types.h"
     26 #include "tensorflow/core/platform/types.h"
     27 #include "tensorflow/core/util/cuda_kernel_helper.h"
     28 
     29 namespace tensorflow {
     30 
     31 typedef Eigen::GpuDevice GPUDevice;
     32 
     33 namespace {
     34 
// Forward nearest-neighbor resize, NHWC layout: one thread per output
// element. Each thread gathers the single nearest input pixel, so no
// synchronization or atomics are needed.
//
// nthreads:     total number of output elements (batch*out_h*out_w*channels).
// bottom_data:  input tensor of shape  [batch, in_height,  in_width,  channels].
// top_data:     output tensor of shape [batch, out_height, out_width, channels].
// height_scale/width_scale: map output coordinates to input coordinates.
// align_corners selects round-to-nearest vs. floor when mapping coordinates.
template <typename T, bool align_corners>
__global__ void ResizeNearestNeighborNHWC(
    const int nthreads, const T* bottom_data, const int in_height,
    const int in_width, const int channels, const int out_height,
    const int out_width, const float height_scale, const float width_scale,
    T* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat output index into (n, out_y, out_x, c); channels is
    // the innermost (fastest-varying) dimension in NHWC.
    int n = index;
    int c = n % channels;
    n /= channels;
    int out_x = n % out_width;
    n /= out_width;
    int out_y = n % out_height;
    n /= out_height;

    // Start of batch n in the input tensor.
    const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
    // Nearest source row/column, clamped so the scaled coordinate can never
    // index past the last input pixel.
    const int in_y =
        min((align_corners) ? static_cast<int>(roundf(out_y * height_scale))
                            : static_cast<int>(floorf(out_y * height_scale)),
            in_height - 1);
    const int in_x =
        min((align_corners) ? static_cast<int>(roundf(out_x * width_scale))
                            : static_cast<int>(floorf(out_x * width_scale)),
            in_width - 1);
    const int idx = (in_y * in_width + in_x) * channels + c;
    // ldg() routes the read-only input load through the texture/read-only
    // cache.
    top_data[index] = ldg(bottom_data_n + idx);
  }
}
     63 
// Backward pass of nearest-neighbor resize, NHWC layout: one thread per
// element of the incoming gradient (top_diff). Each thread computes which
// forward-input pixel its gradient element came from and scatter-adds into
// bottom_diff.
//
// Naming note: here in_height/in_width are the dimensions of top_diff (the
// forward op's OUTPUT gradient) and out_height/out_width are the dimensions
// of bottom_diff (the gradient w.r.t. the forward op's input); the scales map
// top_diff coordinates to bottom_diff coordinates.
//
// bottom_diff must be zero-initialized before launch: several top_diff
// elements can map to the same bottom_diff cell, hence the atomic add.
template <typename T, bool align_corners>
__global__ void ResizeNearestNeighborBackwardNHWC(
    const int nthreads, const T* top_diff, const int in_height,
    const int in_width, const int channels, const int out_height,
    const int out_width, const float height_scale, const float width_scale,
    T* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat top_diff index into (n, in_y, in_x, c).
    int n = index;
    int c = n % channels;
    n /= channels;
    int in_x = n % in_width;
    n /= in_width;
    int in_y = n % in_height;
    n /= in_height;

    // Start of batch n in the gradient-output tensor.
    T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
    // Same clamped nearest-neighbor mapping as the forward kernel, so the
    // gradient flows back to exactly the pixel the forward pass read.
    const int out_y =
        min((align_corners) ? static_cast<int>(roundf(in_y * height_scale))
                            : static_cast<int>(floorf(in_y * height_scale)),
            out_height - 1);
    const int out_x =
        min((align_corners) ? static_cast<int>(roundf(in_x * width_scale))
                            : static_cast<int>(floorf(in_x * width_scale)),
            out_width - 1);
    const int idx = (out_y * out_width + out_x) * channels + c;
    // Atomic accumulation: multiple threads may target the same idx.
    CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));
  }
}
     92 
     93 }  // namespace
     94 
     95 namespace functor {
     96 
     97 // Partial specialization of ResizeNearestNeighbor functor for a GPUDevice.
     98 template <typename T, bool align_corners>
     99 struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
    100   bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
    101                   const float height_scale, const float width_scale,
    102                   typename TTypes<T, 4>::Tensor output) {
    103     const int batch_size = input.dimension(0);
    104     const int64 in_height = input.dimension(1);
    105     const int64 in_width = input.dimension(2);
    106     const int channels = input.dimension(3);
    107 
    108     const int64 out_height = output.dimension(1);
    109     const int64 out_width = output.dimension(2);
    110 
    111     const int output_size = batch_size * out_height * out_width * channels;
    112     if (output_size == 0) return true;
    113 
    114     CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
    115     ResizeNearestNeighborNHWC<T, align_corners>
    116         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
    117             output_size, input.data(), in_height, in_width, channels,
    118             out_height, out_width, height_scale, width_scale, output.data());
    119     return d.ok();
    120   }
    121 };
    122 
    123 #define DECLARE_GPU_SPEC(T)                                   \
    124   template struct ResizeNearestNeighbor<GPUDevice, T, false>; \
    125   template struct ResizeNearestNeighbor<GPUDevice, T, true>;
    126 
    127 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
    128 
    129 #undef DECLARE_GPU_SPEC
    130 
    131 // Partial specialization of ResizeNearestNeighborGrad functor for a GPUDevice.
    132 template <typename T, bool align_corners>
    133 struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
    134   bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
    135                   const float height_scale, const float width_scale,
    136                   typename TTypes<T, 4>::Tensor output) {
    137     const int batch_size = input.dimension(0);
    138     const int64 in_height = input.dimension(1);
    139     const int64 in_width = input.dimension(2);
    140     const int channels = input.dimension(3);
    141 
    142     const int64 out_height = output.dimension(1);
    143     const int64 out_width = output.dimension(2);
    144 
    145     const int output_size = batch_size * channels * out_height * out_width;
    146 
    147     CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
    148     SetZero<<<output_config.block_count, output_config.thread_per_block, 0,
    149               d.stream()>>>(output_size, output.data());
    150     if (!d.ok()) return false;
    151 
    152     const int input_size = batch_size * channels * in_height * in_width;
    153     if (input_size == 0) return true;
    154 
    155     CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
    156     ResizeNearestNeighborBackwardNHWC<T, align_corners>
    157         <<<input_config.block_count, input_config.thread_per_block, 0,
    158            d.stream()>>>(input_config.virtual_thread_count, input.data(),
    159                          in_height, in_width, channels, out_height, out_width,
    160                          height_scale, width_scale, output.data());
    161     return d.ok();
    162   }
    163 };
    164 
    165 #define DECLARE_GPU_SPEC(T)                                       \
    166   template struct ResizeNearestNeighborGrad<GPUDevice, T, false>; \
    167   template struct ResizeNearestNeighborGrad<GPUDevice, T, true>;
    168 
    169 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
    170 
    171 #undef DECLARE_GPU_SPEC
    172 
    173 }  // namespace functor
    174 
    175 }  // namespace tensorflow
    176 
    177 #endif  // GOOGLE_CUDA
    178