/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <numeric>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/gpu_device_array.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T, typename Tlen>
class SplitVOpBase : public OpKernel {
 public:
  explicit SplitVOpBase(OpKernelConstruction* c) : OpKernel(c) {}

  void ComputeEasyCases(OpKernelContext* context, bool* done,
                        std::vector<Tlen>* split_sizes_vec) {
    const int32 num_split = context->num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const Tensor& split_tensor = context->input(1);
    const Tensor& split_dim_tensor = context->input(2);

    OP_REQUIRES(context, split_dim_tensor.NumElements() == 1,
                errors::InvalidArgument("split_dim_tensor must have "
                                        "exactly one element."));

    const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    OP_REQUIRES(
        context,
        split_tensor.dims() == 1 && split_tensor.NumElements() == num_split,
        errors::InvalidArgument("split_tensor must be 1-D and have the same "
                                "number of elements as the number of outputs, "
                                "but got ",
                                split_tensor.dims(), "-D and ",
                                split_tensor.NumElements(), " elements"));

    auto split_sizes_d = split_tensor.vec<Tlen>();

    split_sizes_vec->resize(split_sizes_d.size());

    std::copy(split_sizes_d.data(), split_sizes_d.data() + split_sizes_d.size(),
              split_sizes_vec->begin());

    OP_REQUIRES(
        context, num_split > 0,
        errors::InvalidArgument(
            "Number of ways to split should be > 0, but got ", num_split));

    OP_REQUIRES(
        context, 0 <= split_dim && split_dim < input.dims(),
        errors::InvalidArgument("-input rank (-", input.dims(),
                                ") <= split_dim < input rank (", input.dims(),
                                "), but got ", split_dim_orig));

    Tlen input_size_split_dim = input_shape.dim_size(split_dim);

    // Special case 1: num_split == 1. Nothing to do.
    if (num_split == 1) {
      context->set_output(0, context->input(0));
      OP_REQUIRES(
          context, (*split_sizes_vec)[0] == input_size_split_dim,
          errors::InvalidArgument("If there is only one output, it must have "
                                  "the same size as the input. Input size: ",
                                  input_size_split_dim,
                                  " output size: ", (*split_sizes_vec)[0]));
      *done = true;
      return;
    }

    // Determine the output sizes, resolving a possible -1 entry in
    // size_splits.
    int neg_one_dim = -1;
    Tlen determined_size = 0;
    for (int d = 0; d < split_sizes_vec->size(); ++d) {
      Tlen size = (*split_sizes_vec)[d];

      if (size == -1) {
        OP_REQUIRES(context, neg_one_dim == -1,
                    errors::InvalidArgument("There can only be one -1 in the "
                                            "input."));
        neg_one_dim = d;
      } else {
        determined_size += size;
      }
    }

    OP_REQUIRES(
        context,
        (neg_one_dim == -1 && determined_size == input_size_split_dim) ||
            (neg_one_dim >= 0 && determined_size <= input_size_split_dim),
        errors::InvalidArgument("Determined shape must either match "
                                "input shape along split_dim exactly if "
                                "fully specified, or be less than the size of "
                                "the input along split_dim if not fully "
                                "specified. Got: ",
                                determined_size));

    if (neg_one_dim >= 0) {
      (*split_sizes_vec)[neg_one_dim] = input_size_split_dim - determined_size;
    }
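
    // For example (illustrative values, not from the original source): with
    // input_size_split_dim == 10 and size_splits == {2, -1, 3},
    // determined_size is 2 + 3 == 5 and the -1 entry resolves to 10 - 5 == 5.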

    // Special case 2: split along the first dimension (split_dim == 0). We
    // can share the underlying buffer.
    //
    // Apply this optimization conservatively: if the input is aligned, the
    // resulting tensors must also be aligned. It is conservative because, if
    // the immediate consumers of the resulting tensors do not use Eigen for
    // their computation, it is perfectly fine to avoid the copying.
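    //
    // For example (illustrative, assuming EIGEN_MAX_ALIGN_BYTES == 16): a
    // float input of shape [6, 4] split along dimension 0 into sizes {2, 4}
    // can alias the input buffer, since each slice starts at a multiple of
    // 4 * sizeof(float) == 16 bytes.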
    if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
      Tlen start = 0;
      for (int i = 0; i < num_split; ++i) {
        context->set_output(i,
                            input.Slice(start, start + (*split_sizes_vec)[i]));
        start += (*split_sizes_vec)[i];
      }
      *done = true;
      return;
    }
  }

  template <typename IndexType>
  std::tuple<IndexType, IndexType, IndexType> SetDims(
      const TensorShape& input_shape, const int32 split_dim) const {
    static_assert(std::is_integral<IndexType>::value,
                  "IndexType must be an integer type");
    int32 prefix_dim_size = 1;
    for (int i = 0; i < split_dim; ++i) {
      prefix_dim_size *= input_shape.dim_size(i);
    }

    // Caller must ensure that dim_size and suffix_dim_size are <
    // std::numeric_limits<IndexType>::max()
    IndexType split_dim_size =
        static_cast<IndexType>(input_shape.dim_size(split_dim));

    IndexType suffix_dim_size = 1;
    for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
      suffix_dim_size *= static_cast<IndexType>(input_shape.dim_size(i));
    }
    return std::make_tuple(prefix_dim_size, split_dim_size, suffix_dim_size);
  }
};

template <typename T, typename Tlen, typename InputReshapedType, int NDims>
class SplitVOpCPUImpl {
 public:
  template <typename MakeSizesType, typename ReshapeResultType>
  void operator()(OpKernelContext* context,
                  const InputReshapedType& input_reshaped,
                  const std::vector<int64>& split_start_points,
                  const TensorShape& input_shape, int32 split_dim,
                  Eigen::DenseIndex prefix_dim_size,
                  Eigen::DenseIndex split_dim_size,
                  Eigen::DenseIndex suffix_dim_size,
                  std::vector<Tlen>& split_sizes_vec,
                  const MakeSizesType& make_sizes,
                  const ReshapeResultType& reshape_result) const {
    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
    for (int i = 0; i < NDims; ++i) {
      indices[i] = 0;
    }
    const auto num_threads =
        context->device()->tensorflow_cpu_worker_threads()->num_threads;
    // TODO(jewillco): Tune heuristic further.
    const auto input_element_count = input_shape.num_elements();
    const int num_split = split_start_points.size();
    const bool use_parallelism_between_outputs =
        (num_split >= 4 &&
         input_element_count >= std::max(num_threads, num_split) * 4096 &&
         input_element_count < num_split * 180 * 1024);
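
    // For example (illustrative numbers, not from the original source): with
    // 8 worker threads and num_split == 8, between-output parallelism is
    // chosen once the input has at least max(8, 8) * 4096 == 32768 elements
    // and fewer than 8 * 180 * 1024 == 1474560 elements.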

    auto range_output_func = [&indices, context, &input_shape, split_dim,
                              &split_sizes_vec, &split_start_points,
                              use_parallelism_between_outputs, &input_reshaped,
                              &make_sizes,
                              &reshape_result](int64 start, int64 limit) {
      for (int64 i = start; i < limit; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));

        const auto sizes = make_sizes(split_sizes_vec[i]);

        if (sizes.TotalSize() > 0) {
          auto result_shaped = reshape_result(result, split_sizes_vec[i]);

          auto current_indices = indices;
          current_indices[NDims - 2] = split_start_points[i];
          if (use_parallelism_between_outputs) {
            // Each output is computed with a sequential slice assignment;
            // parallelism comes from sharding across outputs.
            result_shaped = input_reshaped.slice(current_indices, sizes);
          } else {
            // This implementation may be parallel internally.
            functor::Split<CPUDevice, T, NDims>()(
                context->eigen_device<CPUDevice>(), result_shaped,
                input_reshaped, current_indices, sizes);
          }
        }
      }
    };
    if (use_parallelism_between_outputs) {
      // Run in parallel, disabling parallelism in the functor.
      Shard(num_split,
            context->device()->tensorflow_cpu_worker_threads()->workers,
            num_split, input_element_count / num_split, range_output_func);
    } else {
      // Run sequentially, but allow internal parallelism in the functor.
      range_output_func(0, num_split);
    }
  }
};

template <typename T, typename Tlen>
class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 public:
  typedef SplitVOpBase<CPUDevice, T, Tlen> Base;
  explicit SplitVOpCPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    std::vector<Tlen> split_sizes_vec;
    Base::ComputeEasyCases(context, &done, &split_sizes_vec);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(2).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    // Android also uses int32 indexing, so check here as well.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
    std::vector<int64> split_start_points(num_split);
    for (int i = 0; i < num_split; ++i) {
      if (i == 0) {
        split_start_points[i] = 0;
      } else {
        split_start_points[i] =
            split_start_points[i - 1] + split_sizes_vec[i - 1];
      }
    }

    if (prefix_dim_size == 1) {
      auto input_reshaped =
          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Tlen split_size) {
        return result->shaped<T, 2>({split_size, suffix_dim_size});
      };
      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 2>{}(
          context, input_reshaped, split_start_points, input_shape, split_dim,
          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
          make_sizes, reshape_result);
    } else {
      auto input_reshaped = input.shaped<T, 3>(
          {prefix_dim_size, split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
                                                   suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Tlen split_size) {
        return result->shaped<T, 3>(
            {prefix_dim_size, split_size, suffix_dim_size});
      };
      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 3>{}(
          context, input_reshaped, split_start_points, input_shape, split_dim,
          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
          make_sizes, reshape_result);
    }
  }
};

#if GOOGLE_CUDA

// Partial specialization for GPU
template <typename T, typename Tlen>
class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
 public:
  typedef SplitVOpBase<GPUDevice, T, Tlen> Base;
  explicit SplitVOpGPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    std::vector<Tlen> split_sizes_vec;
    Base::ComputeEasyCases(context, &done, &split_sizes_vec);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(2).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Split on GPU requires input size "
                                "< max int32"));

    int32 prefix_dim_size;
    int32 split_dim_size;
    int32 suffix_dim_size;
    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<int32>(input_shape, split_dim);

    // Use the same approach as concat (see the documentation there):
    // reshape to 2D.
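    //
    // For example (illustrative values, not from the original source):
    // splitting a [2, 10] input along dimension 1 into sizes {3, 7} gives
    // prefix_dim_size == 2, suffix_dim_size == 1, and column offsets
    // {0, 3, 10} within each reshaped row. (That small case would actually
    // take the num_split <= 16 path below; the numbers only illustrate the
    // offset bookkeeping.)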

    if (num_split > 16) {
      GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
      OP_REQUIRES_OK(context, ptrs.Init());

      GpuDeviceArrayOnHost<Tlen> offsets(context, num_split + 1);
      OP_REQUIRES_OK(context, offsets.Init());

      Tlen offset = 0;
      // Use Tlen (not int) so that int64 split sizes are not truncated.
      Tlen entry = split_sizes_vec[0];
      bool fixed_size =
          std::all_of(split_sizes_vec.begin(), split_sizes_vec.end(),
                      [&entry](Tlen n) { return n == entry; });

      for (int i = 0; i < num_split; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));
        ptrs.Set(i, result->flat<T>().data());
        offsets.Set(i, offset);
        offset += split_sizes_vec[i] * suffix_dim_size;
      }
      offsets.Set(num_split, offset);
      OP_REQUIRES_OK(context, ptrs.Finalize());
      OP_REQUIRES_OK(context, offsets.Finalize());

      if (input.NumElements() > 0) {
        SplitVOpGPULaunch<T, Tlen>().Run(
            context->eigen_device<GPUDevice>(), fixed_size,
            input.flat<T>().data(), prefix_dim_size,
            input.NumElements() / prefix_dim_size, offsets.data(), ptrs.data());
        OP_REQUIRES(
            context, context->op_device_context()->stream()->ok(),
            errors::Internal("Launch of GPU kernel for SplitVOp failed"));
      }
    } else {
      Eigen::DenseIndex prefix_dim_size;
      Eigen::DenseIndex split_dim_size;
      Eigen::DenseIndex suffix_dim_size;

      std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
          Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
      auto input_reshaped = input.shaped<T, 2>(
          {prefix_dim_size, split_dim_size * suffix_dim_size});

      Eigen::DSizes<Eigen::DenseIndex, 2> indices{0, 0};

      for (int i = 0; i < num_split; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));

        Eigen::DSizes<Eigen::DenseIndex, 2> sizes{
            prefix_dim_size, split_sizes_vec[i] * suffix_dim_size};

        if (sizes.TotalSize() > 0) {
          auto result_shaped = result->shaped<T, 2>(
              {prefix_dim_size, split_sizes_vec[i] * suffix_dim_size});

          functor::SplitCustom<GPUDevice, T>()(
              context->eigen_device<GPUDevice>(), result_shaped, input_reshaped,
              indices, sizes);
        }
        indices[1] += split_sizes_vec[i] * suffix_dim_size;
      }
    }
  }
};
#endif  // GOOGLE_CUDA

#define REGISTER_SPLIT(type, len_type)                          \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_CPU)               \
                              .TypeConstraint<len_type>("Tlen") \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim"),         \
                          SplitVOpCPU<type, len_type>);

#define REGISTER_SPLIT_LEN(type) \
  REGISTER_SPLIT(type, int32);   \
  REGISTER_SPLIT(type, int64);
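
// For example (illustrative): REGISTER_SPLIT_LEN(float) expands to two kernel
// registrations, SplitVOpCPU<float, int32> and SplitVOpCPU<float, int64>, each
// with "size_splits" and "split_dim" pinned to host memory.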

TF_CALL_ALL_TYPES(REGISTER_SPLIT_LEN);

#undef REGISTER_SPLIT_LEN
#undef REGISTER_SPLIT

#if GOOGLE_CUDA

#define REGISTER_GPU(type, len_type)                            \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<len_type>("Tlen") \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim"),         \
                          SplitVOpGPU<type, len_type>);

#define REGISTER_GPU_LEN(type) \
  REGISTER_GPU(type, int32);   \
  REGISTER_GPU(type, int64);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_LEN);
TF_CALL_complex64(REGISTER_GPU_LEN);
TF_CALL_complex128(REGISTER_GPU_LEN);
REGISTER_GPU_LEN(bfloat16);
#undef REGISTER_GPU_LEN
#undef REGISTER_GPU

// Special GPU kernel registration for int32: all tensors stay in host memory,
// so the CPU implementation is used.

#define REGISTER_GPU_int32(len_type)                            \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<int32>("T")       \
                              .TypeConstraint<len_type>("Tlen") \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim")          \
                              .HostMemory("value")              \
                              .HostMemory("output"),            \
                          SplitVOpCPU<int32, len_type>);

REGISTER_GPU_int32(int32);
REGISTER_GPU_int32(int64);

#undef REGISTER_GPU_int32

#endif  // GOOGLE_CUDA

}  // end namespace tensorflow