/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
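//
// SplitV splits `value` along `split_dim` into `num_split` tensors whose
// sizes along that dimension are given by `size_splits`. At most one entry
// of `size_splits` may be -1, in which case that size is inferred so the
// sizes sum to the input's extent along `split_dim`. Illustrative example
// (shapes only):
//   value: [4, 6], size_splits = {2, -1, 1}, split_dim = 1
//   -> outputs of shapes [4, 2], [4, 3], [4, 1]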

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <numeric>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/kernels/gpu_device_array.h"
#include "tensorflow/core/kernels/split_lib_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T, typename Tlen>
class SplitVOpBase : public OpKernel {
 public:
  explicit SplitVOpBase(OpKernelConstruction* c) : OpKernel(c) {}

  void ComputeEasyCases(OpKernelContext* context, bool* done,
                        std::vector<Tlen>* split_sizes_vec) {
    const int32 num_split = context->num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const Tensor& split_tensor = context->input(1);
    const Tensor& split_dim_tensor = context->input(2);

    OP_REQUIRES(context, split_dim_tensor.NumElements() == 1,
                errors::InvalidArgument("split_dim_tensor must have "
                                        "exactly one element."));

    const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    OP_REQUIRES(
        context,
        split_tensor.dims() == 1 && split_tensor.NumElements() == num_split,
        errors::InvalidArgument("split_tensor must be 1-D and have the same "
                                "number of elements as the number of outputs; "
                                "got ", split_tensor.dims(), "-D with ",
                                split_tensor.NumElements(), " elements"));

    auto split_sizes_d = split_tensor.vec<Tlen>();

    split_sizes_vec->resize(split_sizes_d.size());

    std::copy(split_sizes_d.data(), split_sizes_d.data() + split_sizes_d.size(),
              split_sizes_vec->begin());

    OP_REQUIRES(
        context, num_split > 0,
        errors::InvalidArgument(
            "Number of ways to split should be > 0, but got ", num_split));

    OP_REQUIRES(
        context, 0 <= split_dim && split_dim < input.dims(),
        errors::InvalidArgument("split_dim must be in the range [-",
                                input.dims(), ", ", input.dims(),
                                "), but got ", split_dim_orig));

    Tlen input_size_split_dim = input_shape.dim_size(split_dim);

    // Special case 1: num_split == 1. Nothing to do.
    if (num_split == 1) {
      context->set_output(0, context->input(0));
      OP_REQUIRES(
          context, (*split_sizes_vec)[0] == input_size_split_dim,
          errors::InvalidArgument("If there is only one output, it must have "
                                  "the same size as the input. Input size: ",
                                  input_size_split_dim,
                                  " output size: ", (*split_sizes_vec)[0]));
      *done = true;
      return;
    }

    // Determine the output sizes, inferring the size of at most one -1 entry.
    int neg_one_dim = -1;
    Tlen determined_size = 0;
    for (int d = 0; d < split_sizes_vec->size(); ++d) {
      Tlen size = (*split_sizes_vec)[d];

      if (size == -1) {
        OP_REQUIRES(context, neg_one_dim == -1,
                    errors::InvalidArgument("There can only be one -1 in the "
                                            "input."));
        neg_one_dim = d;
      } else {
        determined_size += size;
      }
    }

    OP_REQUIRES(
        context,
        (neg_one_dim == -1 && determined_size == input_size_split_dim) ||
            (neg_one_dim >= 0 && determined_size <= input_size_split_dim),
        errors::InvalidArgument("Determined shape must either match "
                                "input shape along split_dim exactly if "
                                "fully specified, or be less than the size of "
                                "the input along split_dim if not fully "
                                "specified. Got: ",
                                determined_size));

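    // For example, with input_size_split_dim = 10 and sizes {2, -1, 3},
    // determined_size is 5, so the -1 entry becomes 10 - 5 = 5.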
    if (neg_one_dim >= 0) {
      (*split_sizes_vec)[neg_one_dim] = input_size_split_dim - determined_size;
    }

    // Special case 2: split along the 1st dimension. We can share the
    // underlying buffer.
    //
    // Apply this optimization conservatively: if the input is aligned,
    // the resulting tensors must also be aligned. This is conservative
    // because if the immediate consumers of the resulting tensors do not
    // use Eigen for computation, it is perfectly fine to avoid the copy
    // even when the outputs are not aligned.
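    //
    // For example, an aligned [6, 4] input split with sizes {2, 4} along
    // dim 0 yields two outputs that alias the input buffer via Slice()
    // instead of copying.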
    if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
      Tlen start = 0;
      for (int i = 0; i < num_split; ++i) {
        context->set_output(i,
                            input.Slice(start, start + (*split_sizes_vec)[i]));
        start += (*split_sizes_vec)[i];
      }
      *done = true;
      return;
    }
  }

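  // Decomposes input_shape into (prefix, split, suffix) extents around
  // split_dim: the product of the dimensions before split_dim, the extent
  // of split_dim itself, and the product of the dimensions after it. For
  // example (illustrative), shape [2, 3, 4, 5] with split_dim = 2 yields
  // (6, 4, 5).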
  template <typename IndexType>
  std::tuple<IndexType, IndexType, IndexType> SetDims(
      const TensorShape& input_shape, const int32 split_dim) const {
    static_assert(std::is_integral<IndexType>::value,
                  "IndexType must be an integer type");
    int32 prefix_dim_size = 1;
    for (int i = 0; i < split_dim; ++i) {
      prefix_dim_size *= input_shape.dim_size(i);
    }

    // Caller must ensure that split_dim_size and suffix_dim_size are <
    // std::numeric_limits<IndexType>::max().
    IndexType split_dim_size =
        static_cast<IndexType>(input_shape.dim_size(split_dim));

    IndexType suffix_dim_size = 1;
    for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
      suffix_dim_size *= static_cast<IndexType>(input_shape.dim_size(i));
    }
    return std::make_tuple(prefix_dim_size, split_dim_size, suffix_dim_size);
  }
};

template <typename T, typename Tlen, typename InputReshapedType, int NDims>
class SplitVOpCPUImpl {
 public:
  template <typename MakeSizesType, typename ReshapeResultType>
  void operator()(OpKernelContext* context,
                  const InputReshapedType& input_reshaped,
                  const std::vector<int64>& split_start_points,
                  const TensorShape& input_shape, int32 split_dim,
                  Eigen::DenseIndex prefix_dim_size,
                  Eigen::DenseIndex split_dim_size,
                  Eigen::DenseIndex suffix_dim_size,
                  std::vector<Tlen>& split_sizes_vec,
                  const MakeSizesType& make_sizes,
                  const ReshapeResultType& reshape_result) const {
    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
    for (int i = 0; i < NDims; ++i) {
      indices[i] = 0;
    }
    const auto num_threads =
        context->device()->tensorflow_cpu_worker_threads()->num_threads;
    // TODO(jewillco): Tune heuristic further.
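    // Heuristic: parallelize across outputs only when there are at least
    // four outputs, the input has at least 4096 elements per worker thread
    // (and per output), and outputs average fewer than 180 * 1024 elements
    // each; otherwise process outputs sequentially and let the Split
    // functor parallelize internally.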
    const auto input_element_count = input_shape.num_elements();
    const int num_split = split_start_points.size();
    const bool use_parallelism_between_outputs =
        (num_split >= 4 &&
         input_element_count >= std::max(num_threads, num_split) * 4096 &&
         input_element_count < num_split * 180 * 1024);

    auto range_output_func = [&indices, context, &input_shape, split_dim,
                              &split_sizes_vec, &split_start_points,
                              use_parallelism_between_outputs, &input_reshaped,
                              &make_sizes,
                              &reshape_result](int64 start, int64 limit) {
      for (int64 i = start; i < limit; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));

        const auto sizes = make_sizes(split_sizes_vec[i]);

        if (sizes.TotalSize() > 0) {
          auto result_shaped = reshape_result(result, split_sizes_vec[i]);

          auto current_indices = indices;
          current_indices[NDims - 2] = split_start_points[i];
          if (use_parallelism_between_outputs) {
            // Parallelism comes from sharding across outputs, so compute
            // this output with a plain (sequential) Eigen slice.
            result_shaped = input_reshaped.slice(current_indices, sizes);
          } else {
            // Outputs are processed sequentially, but the Split functor
            // may parallelize internally.
            functor::Split<CPUDevice, T, NDims>()(
                context->eigen_device<CPUDevice>(), result_shaped,
                input_reshaped, current_indices, sizes);
          }
        }
      }
    };
    if (use_parallelism_between_outputs) {
      // Run in parallel, disabling parallelism in functor.
      Shard(num_split,
            context->device()->tensorflow_cpu_worker_threads()->workers,
            num_split, input_element_count / num_split, range_output_func);
    } else {
      // Run sequentially, but allow internal parallelism in functor.
      range_output_func(0, num_split);
    }
  }
};

template <typename T, typename Tlen>
class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 public:
  typedef SplitVOpBase<CPUDevice, T, Tlen> Base;
  explicit SplitVOpCPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    std::vector<Tlen> split_sizes_vec;
    Base::ComputeEasyCases(context, &done, &split_sizes_vec);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(2).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;

    // Android builds use int32 indexing as well, so perform the bounds
    // check here too.
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(),
                        std::numeric_limits<Eigen::DenseIndex>::max()),
        errors::InvalidArgument("Split requires input size < ",
                                std::numeric_limits<Eigen::DenseIndex>::max()));

    Eigen::DenseIndex prefix_dim_size;
    Eigen::DenseIndex split_dim_size;
    Eigen::DenseIndex suffix_dim_size;

    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
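    // Cumulative start offsets of each output along split_dim; e.g.
    // sizes {2, 3, 5} yield start points {0, 2, 5}.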
    std::vector<int64> split_start_points(num_split);
    for (int i = 0; i < num_split; ++i) {
      if (i == 0) {
        split_start_points[i] = 0;
      } else {
        split_start_points[i] =
            split_start_points[i - 1] + split_sizes_vec[i - 1];
      }
    }

    if (prefix_dim_size == 1) {
      auto input_reshaped =
          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Tlen split_size) {
        return result->shaped<T, 2>({split_size, suffix_dim_size});
      };
      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 2>{}(
          context, input_reshaped, split_start_points, input_shape, split_dim,
          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
          make_sizes, reshape_result);
    } else {
      auto input_reshaped = input.shaped<T, 3>(
          {prefix_dim_size, split_dim_size, suffix_dim_size});
      auto make_sizes = [&](Eigen::DenseIndex split_size) {
        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
                                                   suffix_dim_size};
      };
      auto reshape_result = [&](Tensor* result, Tlen split_size) {
        return result->shaped<T, 3>(
            {prefix_dim_size, split_size, suffix_dim_size});
      };
      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 3>{}(
          context, input_reshaped, split_start_points, input_shape, split_dim,
          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
          make_sizes, reshape_result);
    }
  }
};

#if GOOGLE_CUDA

// Partial specialization for GPU
template <typename T, typename Tlen>
class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
 public:
  typedef SplitVOpBase<GPUDevice, T, Tlen> Base;
  explicit SplitVOpGPU(OpKernelConstruction* c) : Base(c) {}

  void Compute(OpKernelContext* context) override {
    bool done = false;
    std::vector<Tlen> split_sizes_vec;
    Base::ComputeEasyCases(context, &done, &split_sizes_vec);
    if (!context->status().ok() || done) {
      return;
    }
    const int32 num_split = Base::num_outputs();
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int32 split_dim_orig = context->input(2).flat<int32>()(0);
    const int32 split_dim =
        split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
    OP_REQUIRES(
        context,
        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Split on GPU requires input size "
                                "< max int32"));

    int32 prefix_dim_size;
    int32 split_dim_size;
    int32 suffix_dim_size;
    std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
        Base::template SetDims<int32>(input_shape, split_dim);

    // Use the same approach as concat (see the documentation there):
    // reshape the input to 2-D and split along the second dimension.
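    //
    // For many outputs (> 16), output pointers and element offsets are
    // staged in host-side arrays and handed to a single fused GPU kernel;
    // for few outputs, one Eigen slice kernel is emitted per output.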

    if (num_split > 16) {
      GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
      OP_REQUIRES_OK(context, ptrs.Init());

      GpuDeviceArrayOnHost<Tlen> offsets(context, num_split + 1);
      OP_REQUIRES_OK(context, offsets.Init());

      Tlen offset = 0;
      // Detect whether all outputs have the same size along split_dim, so
      // the kernel can use cheaper fixed-size indexing. Use Tlen (not int)
      // to avoid truncating 64-bit split sizes.
      Tlen entry = split_sizes_vec[0];
      bool fixed_size =
          std::all_of(split_sizes_vec.begin(), split_sizes_vec.end(),
                      [&entry](Tlen n) { return n == entry; });

      for (int i = 0; i < num_split; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));
        ptrs.Set(i, result->flat<T>().data());
        offsets.Set(i, offset);
        offset += split_sizes_vec[i] * suffix_dim_size;
      }
      offsets.Set(num_split, offset);
      OP_REQUIRES_OK(context, ptrs.Finalize());
      OP_REQUIRES_OK(context, offsets.Finalize());
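      // Finalize() ships the host-staged pointer and offset arrays to the
      // device so the fused kernel can read them.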

      if (input.NumElements() > 0) {
        SplitVOpGPULaunch<T, Tlen>().Run(
            context->eigen_device<GPUDevice>(), fixed_size,
            input.flat<T>().data(), prefix_dim_size,
            input.NumElements() / prefix_dim_size, offsets.data(), ptrs.data());
        OP_REQUIRES(
            context, context->op_device_context()->stream()->ok(),
            errors::Internal("Launch of gpu kernel for SplitVOp failed"));
      }
    } else {
      Eigen::DenseIndex prefix_dim_size;
      Eigen::DenseIndex split_dim_size;
      Eigen::DenseIndex suffix_dim_size;

      std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
          Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
      auto input_reshaped = input.shaped<T, 2>(
          {prefix_dim_size, split_dim_size * suffix_dim_size});

      Eigen::DSizes<Eigen::DenseIndex, 2> indices{0, 0};

      for (int i = 0; i < num_split; ++i) {
        TensorShape output_shape(input_shape);
        output_shape.set_dim(split_dim, split_sizes_vec[i]);
        Tensor* result = nullptr;
        OP_REQUIRES_OK(context,
                       context->allocate_output(i, output_shape, &result));

        Eigen::DSizes<Eigen::DenseIndex, 2> sizes{
            prefix_dim_size, split_sizes_vec[i] * suffix_dim_size};

        if (sizes.TotalSize() > 0) {
          auto result_shaped = result->shaped<T, 2>(
              {prefix_dim_size, split_sizes_vec[i] * suffix_dim_size});

          functor::SplitCustom<GPUDevice, T>()(
              context->eigen_device<GPUDevice>(), result_shaped, input_reshaped,
              indices, sizes);
        }
        indices[1] += split_sizes_vec[i] * suffix_dim_size;
      }
    }
  }
};
#endif  // GOOGLE_CUDA

#define REGISTER_SPLIT(type, len_type)                          \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_CPU)               \
                              .TypeConstraint<len_type>("Tlen") \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim"),         \
                          SplitVOpCPU<type, len_type>);

#define REGISTER_SPLIT_LEN(type) \
  REGISTER_SPLIT(type, int32);   \
  REGISTER_SPLIT(type, int64);

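// Register the CPU kernel for every standard dtype, with size_splits typed
// as either int32 or int64; size_splits and split_dim always reside in host
// memory.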
TF_CALL_ALL_TYPES(REGISTER_SPLIT_LEN);

#undef REGISTER_SPLIT_LEN
#undef REGISTER_SPLIT

#if GOOGLE_CUDA

#define REGISTER_GPU(type, len_type)                            \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<len_type>("Tlen") \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim"),         \
                          SplitVOpGPU<type, len_type>);

#define REGISTER_GPU_LEN(type) \
  REGISTER_GPU(type, int32);   \
  REGISTER_GPU(type, int64);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_LEN);
TF_CALL_complex64(REGISTER_GPU_LEN);
TF_CALL_complex128(REGISTER_GPU_LEN);
REGISTER_GPU_LEN(bfloat16);
#undef REGISTER_GPU_LEN
#undef REGISTER_GPU

// A special GPU registration for int32: int32 tensors on GPU devices
// typically live in host memory (they usually carry shape-like data), so
// this registration runs the CPU implementation with "value" and "output"
// pinned to host memory.

#define REGISTER_GPU_int32(len_type)                            \
  REGISTER_KERNEL_BUILDER(Name("SplitV")                        \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<int32>("T")       \
                              .TypeConstraint<len_type>("Tlen") \
                              .HostMemory("size_splits")        \
                              .HostMemory("split_dim")          \
                              .HostMemory("value")              \
                              .HostMemory("output"),            \
                          SplitVOpCPU<int32, len_type>);

REGISTER_GPU_int32(int32);
REGISTER_GPU_int32(int64);

#undef REGISTER_GPU_int32

#endif  // GOOGLE_CUDA

}  // end namespace tensorflow