/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include <numeric>

#include "tensorflow/core/kernels/aggregate_ops.h"
#include "tensorflow/core/kernels/aggregate_ops_cpu.h"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

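// AddNOp computes the element-wise sum of its N inputs, all of which must
// have the same shape. Where possible it forwards one of the input buffers
// as the output and then accumulates the remaining inputs into it in chunks,
// using the functor::Add* helpers included above.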
template <typename Device, typename T>
class AddNOp : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    // Try to forward and accumulate the result in one of the input buffers.
    int reused_input = -1;
    gtl::InlinedVector<int, 8> input_indices(num);
    std::iota(input_indices.begin(), input_indices.end(), 0);
    Tensor* output = nullptr;
    for (int input_idx = 0; input_idx < num; ++input_idx) {
      if (ctx->forward_input_to_output_with_shape(input_idx, 0, input0.shape(),
                                                  &output)) {
        reused_input = input_idx;
        break;
      }
    }
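    // If no input buffer could be forwarded, fall back to allocating a fresh
    // output tensor of the common shape.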
    if (reused_input == -1) {
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
    } else if (reused_input > 0) {
      // Move the forwarded buffer to the front so we don't double count
      // anything if there are more than 8 inputs.
      input_indices[0] = reused_input;
      input_indices[reused_input] = 0;
    }
    auto To = output->flat<T>();

#define I(IDX) ctx->input(input_indices[IDX]).flat<T>()
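// I(IDX) is shorthand for the flattened view of the input at (possibly
// permuted) position IDX; after the swap above, the input whose buffer was
// forwarded to the output (if any) is always I(0).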

#if defined(__ANDROID_TYPES_SLIM__)
    // On Android by default, we only support additions of two arguments, so
    // we can reduce the number of template instantiations.
    OP_REQUIRES(ctx, num == 2,
                errors::InvalidArgument("Only additions of two arguments "
                                        "supported. Num inputs: ",
                                        num));
    functor::Add2Functor<Device, T> functor2;
    functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
#else
    static const int kWidth = 8;
    int r = num % kWidth;

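    // Handle the first 2..9 inputs (num % kWidth, remapped so that at least
    // two inputs are consumed) with a single AddK functor; this leaves an
    // exact multiple of kWidth inputs for the accumulation loop below.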
    switch (r) {
      case 2: {
        functor::Add2Functor<Device, T> functor2;
        functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
        break;
      }
      case 3: {
        functor::Add3Functor<Device, T> functor3;
        functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2));
        break;
      }
      case 4: {
        functor::Add4Functor<Device, T> functor4;
        functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3));
        break;
      }
      case 5: {
        functor::Add5Functor<Device, T> functor5;
        functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4));
        break;
      }
      case 6: {
        functor::Add6Functor<Device, T> functor6;
        functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5));
        break;
      }
      case 7: {
        functor::Add7Functor<Device, T> functor7;
        functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6));
        break;
      }
      case 0: {
        functor::Add8Functor<Device, T> functor8;
        functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7));
        r = 8;
        break;
      }
      case 1: {
        functor::Add9Functor<Device, T> functor9;
        functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7), I(8));
        r = 9;
        break;
      }
    }

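    // Fold the remaining inputs into the running sum, eight at a time.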
    for (; r < num; r += kWidth) {
      functor::Add8pFunctor<Device, T> functor8p;
      functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1),
                I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7));
    }
#endif  // defined(__ANDROID_TYPES_SLIM__)

#undef I
  }
};

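// Partial specialization of AddNOp for DT_VARIANT inputs. Each variant input
// must be a scalar tensor holding a single Variant value; the stored values
// are summed pairwise, left to right, through the variant ADD registry
// (BinaryOpVariants with ADD_VARIANT_BINARY_OP).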
template <typename Device>
class AddNOp<Device, Variant> : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    for (int i = 0; i < num; ++i) {
      // Step 1: ensure unary variants.
      OP_REQUIRES(
          ctx, ctx->input(i).dims() == 0,
          errors::InvalidArgument(
              "AddN of non-scalar Tensor with dtype=DT_VARIANT is not "
              "supported; inputs[",
              i, "] has shape: ", ctx->input(i).shape().DebugString(), "."));
    }

    TensorShape common_shape;
    OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape));
    // Step 2: access all variants and ensure shapes match.
    for (int i = 1; i < num; ++i) {
      TensorShape check_shape;
      OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape));
      OP_REQUIRES(ctx, common_shape == check_shape,
                  errors::InvalidArgument(
                      "AddN of Variants of differing shapes; inputs[0] shape: ",
                      common_shape.DebugString(), ", inputs[", i,
                      "] shape: ", check_shape.DebugString()));
    }

    // Step 3: attempt to add using
    //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...).
    // The output starts out as a default-constructed Variant object.
    // TODO(ebrevdo): Perform summation in a tree-structure.
    Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
    Variant* v_out = &(out.scalar<Variant>()());
    OP_REQUIRES_OK(
        ctx, BinaryOpVariants<Device>(
                 ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(),
                 ctx->input(1).scalar<Variant>()(), v_out));
    for (int i = 2; i < num; ++i) {
      const Variant tmp = std::move(*v_out);
      const Variant& inp = ctx->input(i).scalar<Variant>()();
      OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP,
                                                   inp, tmp, v_out));
    }
    ctx->set_output(0, out);
  }
};

#define REGISTER_ADDN(type, dev)                                   \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      AddNOp<dev##Device, type>)

#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)

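// For illustration, REGISTER_ADDN_CPU(float) expands to roughly:
//
//   REGISTER_KERNEL_BUILDER(
//       Name("AddN").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       AddNOp<CPUDevice, float>);
//
// i.e. one AddNOp instantiation is registered per (device, dtype) pair.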
TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
REGISTER_ADDN_CPU(Variant);

#undef REGISTER_ADDN_CPU

#if GOOGLE_CUDA
#define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
TF_CALL_complex64(REGISTER_ADDN_GPU);
TF_CALL_complex128(REGISTER_ADDN_GPU);
TF_CALL_variant(REGISTER_ADDN_GPU);
#undef REGISTER_ADDN_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
REGISTER_ADDN(float, SYCL);
REGISTER_ADDN(double, SYCL);

// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);
#endif  // TENSORFLOW_USE_SYCL

#undef REGISTER_ADDN

}  // namespace tensorflow