1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // See docs in ../ops/image_ops.cc. 17 18 #if GOOGLE_CUDA 19 20 #define EIGEN_USE_GPU 21 22 #include "tensorflow/core/kernels/crop_and_resize_op.h" 23 24 #include "tensorflow/core/framework/register_types.h" 25 #include "tensorflow/core/framework/tensor_types.h" 26 #include "tensorflow/core/platform/types.h" 27 #include "tensorflow/core/util/cuda_kernel_helper.h" 28 29 namespace tensorflow { 30 31 typedef Eigen::GpuDevice GPUDevice; 32 33 namespace { 34 35 template <typename T> 36 __global__ void CropAndResizeKernel( 37 const int32 nthreads, const T* image_ptr, const float* boxes_ptr, 38 const int32* box_ind_ptr, int num_boxes, int batch, int image_height, 39 int image_width, int crop_height, int crop_width, int depth, 40 float extrapolation_value, float* crops_ptr) { 41 CUDA_1D_KERNEL_LOOP(out_idx, nthreads) { 42 // out_idx = d + depth * (w + crop_width * (h + crop_height * b)) 43 int idx = out_idx; 44 const int d = idx % depth; 45 idx /= depth; 46 const int x = idx % crop_width; 47 idx /= crop_width; 48 const int y = idx % crop_height; 49 const int b = idx / crop_height; 50 51 const float y1 = boxes_ptr[b * 4]; 52 const float x1 = boxes_ptr[b * 4 + 1]; 53 const float y2 = boxes_ptr[b * 4 + 2]; 54 const float x2 = boxes_ptr[b * 4 + 3]; 55 56 const int32 b_in = box_ind_ptr[b]; 57 if (b_in < 0 || b_in >= batch) { 58 continue; 59 } 60 61 const float height_scale = 62 (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1) 63 : 0; 64 const float width_scale = 65 (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0; 66 67 const float in_y = (crop_height > 1) 68 ? y1 * (image_height - 1) + y * height_scale 69 : 0.5 * (y1 + y2) * (image_height - 1); 70 if (in_y < 0 || in_y > image_height - 1) { 71 crops_ptr[out_idx] = extrapolation_value; 72 continue; 73 } 74 75 const float in_x = (crop_width > 1) 76 ? x1 * (image_width - 1) + x * width_scale 77 : 0.5 * (x1 + x2) * (image_width - 1); 78 if (in_x < 0 || in_x > image_width - 1) { 79 crops_ptr[out_idx] = extrapolation_value; 80 continue; 81 } 82 83 const int top_y_index = floorf(in_y); 84 const int bottom_y_index = ceilf(in_y); 85 const float y_lerp = in_y - top_y_index; 86 87 const int left_x_index = floorf(in_x); 88 const int right_x_index = ceilf(in_x); 89 const float x_lerp = in_x - left_x_index; 90 91 const float top_left(static_cast<float>( 92 image_ptr[((b_in * image_height + top_y_index) * image_width + 93 left_x_index) * 94 depth + 95 d])); 96 const float top_right(static_cast<float>( 97 image_ptr[((b_in * image_height + top_y_index) * image_width + 98 right_x_index) * 99 depth + 100 d])); 101 const float bottom_left(static_cast<float>( 102 image_ptr[((b_in * image_height + bottom_y_index) * image_width + 103 left_x_index) * 104 depth + 105 d])); 106 const float bottom_right(static_cast<float>( 107 image_ptr[((b_in * image_height + bottom_y_index) * image_width + 108 right_x_index) * 109 depth + 110 d])); 111 const float top = top_left + (top_right - top_left) * x_lerp; 112 const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; 113 crops_ptr[out_idx] = top + (bottom - top) * y_lerp; 114 } 115 } 116 117 template <typename T> 118 __global__ void CropAndResizeBackpropImageKernel( 119 const int32 nthreads, const float* grads_ptr, const float* boxes_ptr, 120 const int32* box_ind_ptr, int num_boxes, int batch, int image_height, 121 int image_width, int crop_height, int crop_width, int depth, 122 T* grads_image_ptr) { 123 CUDA_1D_KERNEL_LOOP(out_idx, nthreads) { 124 // out_idx = d + depth * (w + crop_width * (h + crop_height * b)) 125 int idx = out_idx; 126 const int d = idx % depth; 127 idx /= depth; 128 const int x = idx % crop_width; 129 idx /= crop_width; 130 const int y = idx % crop_height; 131 const int b = idx / crop_height; 132 133 const float y1 = boxes_ptr[b * 4]; 134 const float x1 = boxes_ptr[b * 4 + 1]; 135 const float y2 = boxes_ptr[b * 4 + 2]; 136 const float x2 = boxes_ptr[b * 4 + 3]; 137 138 const int32 b_in = box_ind_ptr[b]; 139 if (b_in < 0 || b_in >= batch) { 140 continue; 141 } 142 143 const float height_scale = 144 (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1) 145 : 0; 146 const float width_scale = 147 (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0; 148 149 const float in_y = (crop_height > 1) 150 ? y1 * (image_height - 1) + y * height_scale 151 : 0.5 * (y1 + y2) * (image_height - 1); 152 if (in_y < 0 || in_y > image_height - 1) { 153 continue; 154 } 155 156 const float in_x = (crop_width > 1) 157 ? x1 * (image_width - 1) + x * width_scale 158 : 0.5 * (x1 + x2) * (image_width - 1); 159 if (in_x < 0 || in_x > image_width - 1) { 160 continue; 161 } 162 163 const int top_y_index = floorf(in_y); 164 const int bottom_y_index = ceilf(in_y); 165 const float y_lerp = in_y - top_y_index; 166 167 const int left_x_index = floorf(in_x); 168 const int right_x_index = ceilf(in_x); 169 const float x_lerp = in_x - left_x_index; 170 171 const float dtop = (1 - y_lerp) * grads_ptr[out_idx]; 172 CudaAtomicAdd( 173 grads_image_ptr + 174 ((b_in * image_height + top_y_index) * image_width + left_x_index) * 175 depth + 176 d, 177 static_cast<T>((1 - x_lerp) * dtop)); 178 CudaAtomicAdd(grads_image_ptr + 179 ((b_in * image_height + top_y_index) * image_width + 180 right_x_index) * 181 depth + 182 d, 183 static_cast<T>(x_lerp * dtop)); 184 185 const float dbottom = y_lerp * grads_ptr[out_idx]; 186 CudaAtomicAdd(grads_image_ptr + 187 ((b_in * image_height + bottom_y_index) * image_width + 188 left_x_index) * 189 depth + 190 d, 191 static_cast<T>((1 - x_lerp) * dbottom)); 192 CudaAtomicAdd(grads_image_ptr + 193 ((b_in * image_height + bottom_y_index) * image_width + 194 right_x_index) * 195 depth + 196 d, 197 static_cast<T>(x_lerp * dbottom)); 198 } 199 } 200 201 template <typename T> 202 __global__ void CropAndResizeBackpropBoxesKernel( 203 const int32 nthreads, const float* grads_ptr, const T* image_ptr, 204 const float* boxes_ptr, const int32* box_ind_ptr, int num_boxes, int batch, 205 int image_height, int image_width, int crop_height, int crop_width, 206 int depth, float* grads_boxes_ptr) { 207 CUDA_1D_KERNEL_LOOP(out_idx, nthreads) { 208 // out_idx = d + depth * (w + crop_width * (h + crop_height * b)) 209 int idx = out_idx; 210 const int d = idx % depth; 211 idx /= depth; 212 const int x = idx % crop_width; 213 idx /= crop_width; 214 const int y = idx % crop_height; 215 const int b = idx / crop_height; 216 217 const float y1 = boxes_ptr[b * 4]; 218 const float x1 = boxes_ptr[b * 4 + 1]; 219 const float y2 = boxes_ptr[b * 4 + 2]; 220 const float x2 = boxes_ptr[b * 4 + 3]; 221 222 const int32 b_in = box_ind_ptr[b]; 223 if (b_in < 0 || b_in >= batch) { 224 continue; 225 } 226 227 const float height_ratio = 228 (crop_height > 1) 229 ? static_cast<float>(image_height - 1) / (crop_height - 1) 230 : 0; 231 const float width_ratio = 232 (crop_width > 1) 233 ? static_cast<float>(image_width - 1) / (crop_width - 1) 234 : 0; 235 236 const float height_scale = (crop_height > 1) ? (y2 - y1) * height_ratio : 0; 237 const float width_scale = (crop_width > 1) ? (x2 - x1) * width_ratio : 0; 238 239 const float in_y = (crop_height > 1) 240 ? y1 * (image_height - 1) + y * height_scale 241 : 0.5 * (y1 + y2) * (image_height - 1); 242 if (in_y < 0 || in_y > image_height - 1) { 243 continue; 244 } 245 246 const float in_x = (crop_width > 1) 247 ? x1 * (image_width - 1) + x * width_scale 248 : 0.5 * (x1 + x2) * (image_width - 1); 249 if (in_x < 0 || in_x > image_width - 1) { 250 continue; 251 } 252 253 const int top_y_index = floorf(in_y); 254 const int bottom_y_index = ceilf(in_y); 255 const float y_lerp = in_y - top_y_index; 256 257 const int left_x_index = floorf(in_x); 258 const int right_x_index = ceilf(in_x); 259 const float x_lerp = in_x - left_x_index; 260 261 const float top_left(static_cast<float>( 262 image_ptr[((b_in * image_height + top_y_index) * image_width + 263 left_x_index) * 264 depth + 265 d])); 266 const float top_right(static_cast<float>( 267 image_ptr[((b_in * image_height + top_y_index) * image_width + 268 right_x_index) * 269 depth + 270 d])); 271 const float bottom_left(static_cast<float>( 272 image_ptr[((b_in * image_height + bottom_y_index) * image_width + 273 left_x_index) * 274 depth + 275 d])); 276 const float bottom_right(static_cast<float>( 277 image_ptr[((b_in * image_height + bottom_y_index) * image_width + 278 right_x_index) * 279 depth + 280 d])); 281 282 // Compute the image gradient. 283 float image_grad_y = (1 - x_lerp) * (bottom_left - top_left) + 284 x_lerp * (bottom_right - top_right); 285 float image_grad_x = (1 - y_lerp) * (top_right - top_left) + 286 y_lerp * (bottom_right - bottom_left); 287 // Modulate the image gradient with the incoming gradient. 288 const float top_grad = grads_ptr[out_idx]; 289 image_grad_y *= top_grad; 290 image_grad_x *= top_grad; 291 292 float dy1, dy2; 293 if (crop_height > 1) { 294 dy1 = image_grad_y * (image_height - 1 - y * height_ratio); 295 dy2 = image_grad_y * (y * height_ratio); 296 } else { 297 dy1 = image_grad_y * 0.5 * (image_height - 1); 298 dy2 = image_grad_y * 0.5 * (image_height - 1); 299 } 300 301 float dx1, dx2; 302 if (crop_width > 1) { 303 dx1 = image_grad_x * (image_width - 1 - x * width_ratio); 304 dx2 = image_grad_x * (x * width_ratio); 305 } else { 306 dx1 = image_grad_x * 0.5 * (image_width - 1); 307 dx2 = image_grad_x * 0.5 * (image_width - 1); 308 } 309 310 CudaAtomicAdd(grads_boxes_ptr + b * 4 + 0, dy1); 311 CudaAtomicAdd(grads_boxes_ptr + b * 4 + 1, dx1); 312 CudaAtomicAdd(grads_boxes_ptr + b * 4 + 2, dy2); 313 CudaAtomicAdd(grads_boxes_ptr + b * 4 + 3, dx2); 314 } 315 } 316 317 } // namespace 318 319 namespace functor { 320 321 template <typename T> 322 struct CropAndResize<GPUDevice, T> { 323 bool operator()(const OpKernelContext* context, 324 typename TTypes<T, 4>::ConstTensor image, 325 typename TTypes<float, 2>::ConstTensor boxes, 326 typename TTypes<int32, 1>::ConstTensor box_ind, 327 float extrapolation_value, 328 typename TTypes<float, 4>::Tensor crops) { 329 const int batch = image.dimension(0); 330 const int image_height = image.dimension(1); 331 const int image_width = image.dimension(2); 332 333 const int num_boxes = crops.dimension(0); 334 const int crop_height = crops.dimension(1); 335 const int crop_width = crops.dimension(2); 336 const int depth = crops.dimension(3); 337 338 const int total_count = num_boxes * crop_height * crop_width * depth; 339 const GPUDevice& d = context->eigen_device<GPUDevice>(); 340 341 if (total_count > 0) { 342 CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); 343 CropAndResizeKernel<<<config.block_count, config.thread_per_block, 0, 344 d.stream()>>>( 345 config.virtual_thread_count, image.data(), boxes.data(), 346 box_ind.data(), num_boxes, batch, image_height, image_width, 347 crop_height, crop_width, depth, extrapolation_value, crops.data()); 348 } 349 return d.ok(); 350 } 351 }; 352 353 template <typename T> 354 struct CropAndResizeBackpropImage<GPUDevice, T> { 355 bool operator()(const GPUDevice& d, 356 typename TTypes<float, 4>::ConstTensor grads, 357 typename TTypes<float, 2>::ConstTensor boxes, 358 typename TTypes<int32, 1>::ConstTensor box_ind, 359 typename TTypes<T, 4>::Tensor grads_image) { 360 const int batch = grads_image.dimension(0); 361 const int image_height = grads_image.dimension(1); 362 const int image_width = grads_image.dimension(2); 363 364 const int num_boxes = grads.dimension(0); 365 const int crop_height = grads.dimension(1); 366 const int crop_width = grads.dimension(2); 367 const int depth = grads.dimension(3); 368 369 int total_count; 370 CudaLaunchConfig config; 371 372 // Initialize grads_image with all zeros. 373 total_count = batch * image_height * image_width * depth; 374 if (total_count > 0) { 375 config = GetCudaLaunchConfig(total_count, d); 376 SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 377 config.virtual_thread_count, grads_image.data()); 378 } 379 380 // Accumulate. 381 total_count = num_boxes * crop_height * crop_width * depth; 382 if (total_count > 0) { 383 config = GetCudaLaunchConfig(total_count, d); 384 CropAndResizeBackpropImageKernel<<< 385 config.block_count, config.thread_per_block, 0, d.stream()>>>( 386 config.virtual_thread_count, grads.data(), boxes.data(), 387 box_ind.data(), num_boxes, batch, image_height, image_width, 388 crop_height, crop_width, depth, grads_image.data()); 389 } 390 return d.ok(); 391 } 392 }; 393 394 template <typename T> 395 struct CropAndResizeBackpropBoxes<GPUDevice, T> { 396 bool operator()(const GPUDevice& d, 397 typename TTypes<float, 4>::ConstTensor grads, 398 typename TTypes<T, 4>::ConstTensor image, 399 typename TTypes<float, 2>::ConstTensor boxes, 400 typename TTypes<int32, 1>::ConstTensor box_ind, 401 typename TTypes<float, 2>::Tensor grads_boxes) { 402 const int batch = image.dimension(0); 403 const int image_height = image.dimension(1); 404 const int image_width = image.dimension(2); 405 406 const int num_boxes = grads.dimension(0); 407 const int crop_height = grads.dimension(1); 408 const int crop_width = grads.dimension(2); 409 const int depth = grads.dimension(3); 410 411 int total_count; 412 CudaLaunchConfig config; 413 414 // Initialize grads_boxes with all zeros. 415 total_count = num_boxes * 4; 416 if (total_count > 0) { 417 config = GetCudaLaunchConfig(total_count, d); 418 SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 419 config.virtual_thread_count, grads_boxes.data()); 420 } 421 422 // Accumulate. 423 total_count = num_boxes * crop_height * crop_width * depth; 424 if (total_count > 0) { 425 config = GetCudaLaunchConfig(total_count, d); 426 CropAndResizeBackpropBoxesKernel<<< 427 config.block_count, config.thread_per_block, 0, d.stream()>>>( 428 config.virtual_thread_count, grads.data(), image.data(), boxes.data(), 429 box_ind.data(), num_boxes, batch, image_height, image_width, 430 crop_height, crop_width, depth, grads_boxes.data()); 431 } 432 return d.ok(); 433 } 434 }; 435 436 #define DEFINE_GPU_SPECS(T) \ 437 template struct CropAndResize<GPUDevice, T>; \ 438 template struct CropAndResizeBackpropImage<GPUDevice, T>; \ 439 template struct CropAndResizeBackpropBoxes<GPUDevice, T>; 440 441 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); 442 443 #undef DEFINE_GPU_SPECS 444 445 template struct CheckValidBoxIndexHelper<GPUDevice>; 446 447 } // namespace functor 448 } // namespace tensorflow 449 450 #endif // GOOGLE_CUDA 451