Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // See docs in ../ops/image_ops.cc.
     17 
     18 #if GOOGLE_CUDA
     19 
     20 #define EIGEN_USE_GPU
     21 
     22 #include "tensorflow/core/kernels/crop_and_resize_op.h"
     23 
     24 #include "tensorflow/core/framework/register_types.h"
     25 #include "tensorflow/core/framework/tensor_types.h"
     26 #include "tensorflow/core/platform/types.h"
     27 #include "tensorflow/core/util/cuda_kernel_helper.h"
     28 
     29 namespace tensorflow {
     30 
     31 typedef Eigen::GpuDevice GPUDevice;
     32 
     33 namespace {
     34 
     35 template <typename T>
     36 __global__ void CropAndResizeKernel(
     37     const int32 nthreads, const T* image_ptr, const float* boxes_ptr,
     38     const int32* box_ind_ptr, int num_boxes, int batch, int image_height,
     39     int image_width, int crop_height, int crop_width, int depth,
     40     float extrapolation_value, float* crops_ptr) {
     41   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
     42     // out_idx = d + depth * (w + crop_width * (h + crop_height * b))
     43     int idx = out_idx;
     44     const int d = idx % depth;
     45     idx /= depth;
     46     const int x = idx % crop_width;
     47     idx /= crop_width;
     48     const int y = idx % crop_height;
     49     const int b = idx / crop_height;
     50 
     51     const float y1 = boxes_ptr[b * 4];
     52     const float x1 = boxes_ptr[b * 4 + 1];
     53     const float y2 = boxes_ptr[b * 4 + 2];
     54     const float x2 = boxes_ptr[b * 4 + 3];
     55 
     56     const int32 b_in = box_ind_ptr[b];
     57     if (b_in < 0 || b_in >= batch) {
     58       continue;
     59     }
     60 
     61     const float height_scale =
     62         (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
     63                           : 0;
     64     const float width_scale =
     65         (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0;
     66 
     67     const float in_y = (crop_height > 1)
     68                            ? y1 * (image_height - 1) + y * height_scale
     69                            : 0.5 * (y1 + y2) * (image_height - 1);
     70     if (in_y < 0 || in_y > image_height - 1) {
     71       crops_ptr[out_idx] = extrapolation_value;
     72       continue;
     73     }
     74 
     75     const float in_x = (crop_width > 1)
     76                            ? x1 * (image_width - 1) + x * width_scale
     77                            : 0.5 * (x1 + x2) * (image_width - 1);
     78     if (in_x < 0 || in_x > image_width - 1) {
     79       crops_ptr[out_idx] = extrapolation_value;
     80       continue;
     81     }
     82 
     83     const int top_y_index = floorf(in_y);
     84     const int bottom_y_index = ceilf(in_y);
     85     const float y_lerp = in_y - top_y_index;
     86 
     87     const int left_x_index = floorf(in_x);
     88     const int right_x_index = ceilf(in_x);
     89     const float x_lerp = in_x - left_x_index;
     90 
     91     const float top_left(static_cast<float>(
     92         image_ptr[((b_in * image_height + top_y_index) * image_width +
     93                    left_x_index) *
     94                       depth +
     95                   d]));
     96     const float top_right(static_cast<float>(
     97         image_ptr[((b_in * image_height + top_y_index) * image_width +
     98                    right_x_index) *
     99                       depth +
    100                   d]));
    101     const float bottom_left(static_cast<float>(
    102         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
    103                    left_x_index) *
    104                       depth +
    105                   d]));
    106     const float bottom_right(static_cast<float>(
    107         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
    108                    right_x_index) *
    109                       depth +
    110                   d]));
    111     const float top = top_left + (top_right - top_left) * x_lerp;
    112     const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
    113     crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
    114   }
    115 }
    116 
    117 template <typename T>
    118 __global__ void CropAndResizeBackpropImageKernel(
    119     const int32 nthreads, const float* grads_ptr, const float* boxes_ptr,
    120     const int32* box_ind_ptr, int num_boxes, int batch, int image_height,
    121     int image_width, int crop_height, int crop_width, int depth,
    122     T* grads_image_ptr) {
    123   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    124     // out_idx = d + depth * (w + crop_width * (h + crop_height * b))
    125     int idx = out_idx;
    126     const int d = idx % depth;
    127     idx /= depth;
    128     const int x = idx % crop_width;
    129     idx /= crop_width;
    130     const int y = idx % crop_height;
    131     const int b = idx / crop_height;
    132 
    133     const float y1 = boxes_ptr[b * 4];
    134     const float x1 = boxes_ptr[b * 4 + 1];
    135     const float y2 = boxes_ptr[b * 4 + 2];
    136     const float x2 = boxes_ptr[b * 4 + 3];
    137 
    138     const int32 b_in = box_ind_ptr[b];
    139     if (b_in < 0 || b_in >= batch) {
    140       continue;
    141     }
    142 
    143     const float height_scale =
    144         (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
    145                           : 0;
    146     const float width_scale =
    147         (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0;
    148 
    149     const float in_y = (crop_height > 1)
    150                            ? y1 * (image_height - 1) + y * height_scale
    151                            : 0.5 * (y1 + y2) * (image_height - 1);
    152     if (in_y < 0 || in_y > image_height - 1) {
    153       continue;
    154     }
    155 
    156     const float in_x = (crop_width > 1)
    157                            ? x1 * (image_width - 1) + x * width_scale
    158                            : 0.5 * (x1 + x2) * (image_width - 1);
    159     if (in_x < 0 || in_x > image_width - 1) {
    160       continue;
    161     }
    162 
    163     const int top_y_index = floorf(in_y);
    164     const int bottom_y_index = ceilf(in_y);
    165     const float y_lerp = in_y - top_y_index;
    166 
    167     const int left_x_index = floorf(in_x);
    168     const int right_x_index = ceilf(in_x);
    169     const float x_lerp = in_x - left_x_index;
    170 
    171     const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
    172     CudaAtomicAdd(
    173         grads_image_ptr +
    174             ((b_in * image_height + top_y_index) * image_width + left_x_index) *
    175                 depth +
    176             d,
    177         static_cast<T>((1 - x_lerp) * dtop));
    178     CudaAtomicAdd(grads_image_ptr +
    179                       ((b_in * image_height + top_y_index) * image_width +
    180                        right_x_index) *
    181                           depth +
    182                       d,
    183                   static_cast<T>(x_lerp * dtop));
    184 
    185     const float dbottom = y_lerp * grads_ptr[out_idx];
    186     CudaAtomicAdd(grads_image_ptr +
    187                       ((b_in * image_height + bottom_y_index) * image_width +
    188                        left_x_index) *
    189                           depth +
    190                       d,
    191                   static_cast<T>((1 - x_lerp) * dbottom));
    192     CudaAtomicAdd(grads_image_ptr +
    193                       ((b_in * image_height + bottom_y_index) * image_width +
    194                        right_x_index) *
    195                           depth +
    196                       d,
    197                   static_cast<T>(x_lerp * dbottom));
    198   }
    199 }
    200 
    201 template <typename T>
    202 __global__ void CropAndResizeBackpropBoxesKernel(
    203     const int32 nthreads, const float* grads_ptr, const T* image_ptr,
    204     const float* boxes_ptr, const int32* box_ind_ptr, int num_boxes, int batch,
    205     int image_height, int image_width, int crop_height, int crop_width,
    206     int depth, float* grads_boxes_ptr) {
    207   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    208     // out_idx = d + depth * (w + crop_width * (h + crop_height * b))
    209     int idx = out_idx;
    210     const int d = idx % depth;
    211     idx /= depth;
    212     const int x = idx % crop_width;
    213     idx /= crop_width;
    214     const int y = idx % crop_height;
    215     const int b = idx / crop_height;
    216 
    217     const float y1 = boxes_ptr[b * 4];
    218     const float x1 = boxes_ptr[b * 4 + 1];
    219     const float y2 = boxes_ptr[b * 4 + 2];
    220     const float x2 = boxes_ptr[b * 4 + 3];
    221 
    222     const int32 b_in = box_ind_ptr[b];
    223     if (b_in < 0 || b_in >= batch) {
    224       continue;
    225     }
    226 
    227     const float height_ratio =
    228         (crop_height > 1)
    229             ? static_cast<float>(image_height - 1) / (crop_height - 1)
    230             : 0;
    231     const float width_ratio =
    232         (crop_width > 1)
    233             ? static_cast<float>(image_width - 1) / (crop_width - 1)
    234             : 0;
    235 
    236     const float height_scale = (crop_height > 1) ? (y2 - y1) * height_ratio : 0;
    237     const float width_scale = (crop_width > 1) ? (x2 - x1) * width_ratio : 0;
    238 
    239     const float in_y = (crop_height > 1)
    240                            ? y1 * (image_height - 1) + y * height_scale
    241                            : 0.5 * (y1 + y2) * (image_height - 1);
    242     if (in_y < 0 || in_y > image_height - 1) {
    243       continue;
    244     }
    245 
    246     const float in_x = (crop_width > 1)
    247                            ? x1 * (image_width - 1) + x * width_scale
    248                            : 0.5 * (x1 + x2) * (image_width - 1);
    249     if (in_x < 0 || in_x > image_width - 1) {
    250       continue;
    251     }
    252 
    253     const int top_y_index = floorf(in_y);
    254     const int bottom_y_index = ceilf(in_y);
    255     const float y_lerp = in_y - top_y_index;
    256 
    257     const int left_x_index = floorf(in_x);
    258     const int right_x_index = ceilf(in_x);
    259     const float x_lerp = in_x - left_x_index;
    260 
    261     const float top_left(static_cast<float>(
    262         image_ptr[((b_in * image_height + top_y_index) * image_width +
    263                    left_x_index) *
    264                       depth +
    265                   d]));
    266     const float top_right(static_cast<float>(
    267         image_ptr[((b_in * image_height + top_y_index) * image_width +
    268                    right_x_index) *
    269                       depth +
    270                   d]));
    271     const float bottom_left(static_cast<float>(
    272         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
    273                    left_x_index) *
    274                       depth +
    275                   d]));
    276     const float bottom_right(static_cast<float>(
    277         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
    278                    right_x_index) *
    279                       depth +
    280                   d]));
    281 
    282     // Compute the image gradient.
    283     float image_grad_y = (1 - x_lerp) * (bottom_left - top_left) +
    284                          x_lerp * (bottom_right - top_right);
    285     float image_grad_x = (1 - y_lerp) * (top_right - top_left) +
    286                          y_lerp * (bottom_right - bottom_left);
    287     // Modulate the image gradient with the incoming gradient.
    288     const float top_grad = grads_ptr[out_idx];
    289     image_grad_y *= top_grad;
    290     image_grad_x *= top_grad;
    291 
    292     float dy1, dy2;
    293     if (crop_height > 1) {
    294       dy1 = image_grad_y * (image_height - 1 - y * height_ratio);
    295       dy2 = image_grad_y * (y * height_ratio);
    296     } else {
    297       dy1 = image_grad_y * 0.5 * (image_height - 1);
    298       dy2 = image_grad_y * 0.5 * (image_height - 1);
    299     }
    300 
    301     float dx1, dx2;
    302     if (crop_width > 1) {
    303       dx1 = image_grad_x * (image_width - 1 - x * width_ratio);
    304       dx2 = image_grad_x * (x * width_ratio);
    305     } else {
    306       dx1 = image_grad_x * 0.5 * (image_width - 1);
    307       dx2 = image_grad_x * 0.5 * (image_width - 1);
    308     }
    309 
    310     CudaAtomicAdd(grads_boxes_ptr + b * 4 + 0, dy1);
    311     CudaAtomicAdd(grads_boxes_ptr + b * 4 + 1, dx1);
    312     CudaAtomicAdd(grads_boxes_ptr + b * 4 + 2, dy2);
    313     CudaAtomicAdd(grads_boxes_ptr + b * 4 + 3, dx2);
    314   }
    315 }
    316 
    317 }  // namespace
    318 
    319 namespace functor {
    320 
    321 template <typename T>
    322 struct CropAndResize<GPUDevice, T> {
    323   bool operator()(const OpKernelContext* context,
    324                   typename TTypes<T, 4>::ConstTensor image,
    325                   typename TTypes<float, 2>::ConstTensor boxes,
    326                   typename TTypes<int32, 1>::ConstTensor box_ind,
    327                   float extrapolation_value,
    328                   typename TTypes<float, 4>::Tensor crops) {
    329     const int batch = image.dimension(0);
    330     const int image_height = image.dimension(1);
    331     const int image_width = image.dimension(2);
    332 
    333     const int num_boxes = crops.dimension(0);
    334     const int crop_height = crops.dimension(1);
    335     const int crop_width = crops.dimension(2);
    336     const int depth = crops.dimension(3);
    337 
    338     const int total_count = num_boxes * crop_height * crop_width * depth;
    339     const GPUDevice& d = context->eigen_device<GPUDevice>();
    340 
    341     if (total_count > 0) {
    342       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
    343       CropAndResizeKernel<<<config.block_count, config.thread_per_block, 0,
    344                             d.stream()>>>(
    345           config.virtual_thread_count, image.data(), boxes.data(),
    346           box_ind.data(), num_boxes, batch, image_height, image_width,
    347           crop_height, crop_width, depth, extrapolation_value, crops.data());
    348     }
    349     return d.ok();
    350   }
    351 };
    352 
    353 template <typename T>
    354 struct CropAndResizeBackpropImage<GPUDevice, T> {
    355   bool operator()(const GPUDevice& d,
    356                   typename TTypes<float, 4>::ConstTensor grads,
    357                   typename TTypes<float, 2>::ConstTensor boxes,
    358                   typename TTypes<int32, 1>::ConstTensor box_ind,
    359                   typename TTypes<T, 4>::Tensor grads_image) {
    360     const int batch = grads_image.dimension(0);
    361     const int image_height = grads_image.dimension(1);
    362     const int image_width = grads_image.dimension(2);
    363 
    364     const int num_boxes = grads.dimension(0);
    365     const int crop_height = grads.dimension(1);
    366     const int crop_width = grads.dimension(2);
    367     const int depth = grads.dimension(3);
    368 
    369     int total_count;
    370     CudaLaunchConfig config;
    371 
    372     // Initialize grads_image with all zeros.
    373     total_count = batch * image_height * image_width * depth;
    374     if (total_count > 0) {
    375       config = GetCudaLaunchConfig(total_count, d);
    376       SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
    377           config.virtual_thread_count, grads_image.data());
    378     }
    379 
    380     // Accumulate.
    381     total_count = num_boxes * crop_height * crop_width * depth;
    382     if (total_count > 0) {
    383       config = GetCudaLaunchConfig(total_count, d);
    384       CropAndResizeBackpropImageKernel<<<
    385           config.block_count, config.thread_per_block, 0, d.stream()>>>(
    386           config.virtual_thread_count, grads.data(), boxes.data(),
    387           box_ind.data(), num_boxes, batch, image_height, image_width,
    388           crop_height, crop_width, depth, grads_image.data());
    389     }
    390     return d.ok();
    391   }
    392 };
    393 
    394 template <typename T>
    395 struct CropAndResizeBackpropBoxes<GPUDevice, T> {
    396   bool operator()(const GPUDevice& d,
    397                   typename TTypes<float, 4>::ConstTensor grads,
    398                   typename TTypes<T, 4>::ConstTensor image,
    399                   typename TTypes<float, 2>::ConstTensor boxes,
    400                   typename TTypes<int32, 1>::ConstTensor box_ind,
    401                   typename TTypes<float, 2>::Tensor grads_boxes) {
    402     const int batch = image.dimension(0);
    403     const int image_height = image.dimension(1);
    404     const int image_width = image.dimension(2);
    405 
    406     const int num_boxes = grads.dimension(0);
    407     const int crop_height = grads.dimension(1);
    408     const int crop_width = grads.dimension(2);
    409     const int depth = grads.dimension(3);
    410 
    411     int total_count;
    412     CudaLaunchConfig config;
    413 
    414     // Initialize grads_boxes with all zeros.
    415     total_count = num_boxes * 4;
    416     if (total_count > 0) {
    417       config = GetCudaLaunchConfig(total_count, d);
    418       SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
    419           config.virtual_thread_count, grads_boxes.data());
    420     }
    421 
    422     // Accumulate.
    423     total_count = num_boxes * crop_height * crop_width * depth;
    424     if (total_count > 0) {
    425       config = GetCudaLaunchConfig(total_count, d);
    426       CropAndResizeBackpropBoxesKernel<<<
    427           config.block_count, config.thread_per_block, 0, d.stream()>>>(
    428           config.virtual_thread_count, grads.data(), image.data(), boxes.data(),
    429           box_ind.data(), num_boxes, batch, image_height, image_width,
    430           crop_height, crop_width, depth, grads_boxes.data());
    431     }
    432     return d.ok();
    433   }
    434 };
    435 
    436 #define DEFINE_GPU_SPECS(T)                                 \
    437   template struct CropAndResize<GPUDevice, T>;              \
    438   template struct CropAndResizeBackpropImage<GPUDevice, T>; \
    439   template struct CropAndResizeBackpropBoxes<GPUDevice, T>;
    440 
    441 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
    442 
    443 #undef DEFINE_GPU_SPECS
    444 
    445 template struct CheckValidBoxIndexHelper<GPUDevice>;
    446 
    447 }  // namespace functor
    448 }  // namespace tensorflow
    449 
    450 #endif  // GOOGLE_CUDA
    451