/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This is an internal header file intended to only be included as the
// front-matter in the implementation files of various reduction ops.  It
// is a header file because we split the various reduction ops into their
// own compilation units to get more parallelism in compilation.

#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
#define TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_

#define EIGEN_USE_THREADS

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/reduction_ops.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

template <typename Device>
struct Constants {
  // Derive Index type: int (32-bit) or long (64-bit) depending on the
  // compile-time configuration. "float" here is not relevant.
  // TODO(zhifengc): Move the definition to TTypes.
  typedef TTypes<float>::Tensor::Index Index;
  Eigen::array<Index, 1> kZero;
  Eigen::array<Index, 1> kOne;
  Eigen::array<Index, 2> kZeroTwo;

  Constants() {
    kZero[0] = 0;
    kOne[0] = 1;
    kZeroTwo[0] = 0;
    kZeroTwo[1] = 2;
  }
};

#if defined(EIGEN_HAS_INDEX_LIST)
struct ConstantsBase {
  const Eigen::IndexList<Eigen::type2index<0>> kZero;
  const Eigen::IndexList<Eigen::type2index<1>> kOne;
  const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo;
};
template <>
struct Constants<CPUDevice> : ConstantsBase {};
#ifdef TENSORFLOW_USE_SYCL
template <>
struct Constants<SYCLDevice> : ConstantsBase {};
#endif  // TENSORFLOW_USE_SYCL
#endif  // EIGEN_HAS_INDEX_LIST
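
// Note: when Eigen's compile-time IndexList is available, the reduction axes
// above are encoded in the type itself (via Eigen::type2index) rather than
// stored in an array, so Eigen can select a specialized reduction kernel at
// compile time instead of reading the axes from memory at run time.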

class ReductionHelper {
 public:
  ReductionHelper() : reduce_first_axis_(false) {}

  Status Simplify(const Tensor& data, const Tensor& axis, const bool keep_dims);

  // We need to do roughly:
  //   tmp_out = allocate(out_reshape())
  //   tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes)
  //   out = tmp_out.reshape(out_shape)
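  //
  // For example (hypothetical shapes, sketching what Simplify computes):
  // reducing a [2, 3, 4, 5] input over axes {1, 2} collapses the two
  // adjacent reduced dimensions, giving
  //   data_reshape() == [2, 12, 5]
  //   out_reshape()  == [2, 5]
  //   out_shape()    == [2, 5]   (or [2, 1, 1, 5] when keep_dims is true)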

  // The reduction result must be allocated with this shape.
  TensorShape out_reshape() const;

  // The final output must be allocated with this shape.
  TensorShape out_shape() const;

  // The reduction is on a reshaped tensor of this rank.
  int ndims() const { return data_reshape_.size(); }

  // True if we need to reduce the 0-th dimension.
  bool reduce_first_axis() const { return reduce_first_axis_; }

  // The output is reshaped.
  template <typename T, int N>
  typename TTypes<T, N>::Tensor out(Tensor* out) {
    return out->shaped<T, N>(out_reshape_);
  }

  // The input is reshaped.
  template <typename T, int N>
  typename TTypes<T, N>::ConstTensor in(const Tensor& data) {
    return data.shaped<T, N>(data_reshape_);
  }

  // Shape the input is reshaped to before the reduction.
  TensorShape data_reshape() const {
    TensorShape shape;
    for (auto s : data_reshape_) shape.AddDim(s);
    return shape;
  }

  // Shape with all reduction dimensions at the end.
  TensorShape shuffled_shape();

  // Permutation of the dims needed to put reduction dimensions at the end.
  gtl::InlinedVector<int32, 8> permutation();

 private:
  bool reduce_first_axis_;  // True if we need to reduce the 0-th dimension.
  gtl::InlinedVector<int64, 4> data_reshape_;  // Reshape data before reduction.
  gtl::InlinedVector<int64, 4> out_shape_;     // The final output shape.
  gtl::InlinedVector<int64, 4> out_reshape_;   // Reshape output for reduction.
};
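
// After Simplify(), the pair (ndims(), reduce_first_axis()) determines the
// reduction pattern; a summary of the cases ReductionOp::Compute below
// dispatches on:
//   (1, true)  -> reduce a vector to a scalar
//   (2, true)  -> reduce a matrix along its 1st dimension
//   (2, false) -> reduce a matrix along its 2nd dimension
//   (3, true)  -> reduce a 3-D tensor along its 1st and 3rd dimensions
//   (3, false) -> reduce a 3-D tensor along its 2nd dimension
// Any other combination falls back to a transpose followed by the 2-D case.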

// For operations where the output is a reduction function along some
// dimensions of the input.
template <typename Device, class T, typename Tperm, typename Reducer>
class ReductionOp : public OpKernel {
 public:
  explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    const DataType dt = DataTypeToEnum<T>::v();
    const DataType pt = DataTypeToEnum<Tperm>::v();
    OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, pt}, {dt}));

    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& data = ctx->input(0);
    const Tensor& axes = ctx->input(1);
    VLOG(1) << "data shape: " << data.shape().DebugString();
    VLOG(1) << "axes      : " << axes.SummarizeValue(10);

    ReductionHelper helper;
    OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_));
    CHECK_GE(helper.ndims(), 0);

    if (helper.ndims() == 0 ||
        (helper.ndims() == 1 && !helper.reduce_first_axis())) {
      // Special case. Reduces nothing.  It is unclear why this is
      // necessary, but tests fail without it.  Look into why this
      // case occurs.
      Tensor out;
      if (!out.CopyFrom(data, helper.out_shape())) {
        ctx->SetStatus(errors::Internal("Error during reduction copy."));
      }
      ctx->set_output(0, out);
      return;
    }

    // We must allocate temp tensors using the same alloc attr as
    // output(0) because it is returned as output(0) in the end.
    const AllocatorAttributes alloc_attr = ctx->output_alloc_attr(0);

    // A temporary tensor whose size matches the size of the reduced
    // output.
    Tensor tmp_out;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(ctx->expected_output_dtype(0),
                                helper.out_reshape(), &tmp_out, alloc_attr));

    typedef functor::ReduceFunctor<Device, Reducer> Functor;
    Constants<Device> constants;
    const Device& d = ctx->eigen_device<Device>();
    Reducer reducer;

    if (tmp_out.NumElements() == 0) {
      // Nothing to do, fall through to final reshaping.
    } else if (data.NumElements() == 0) {
      // Degenerate reduction where the input is empty but the output is
      // nonempty (thus tmp_out.NumElements() > 0), and we must fill the output
      // with identity elements.  Example: tf.reduce_sum(tf.zeros((0, 3)), [0]).
      // Eigen sometimes crashes in this case, so we do it manually.
      Functor::FillIdentity(d, tmp_out.flat<T>(), reducer);
    } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) {
      // Reduce to a scalar.
      Functor::Reduce(ctx, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data),
                      constants.kZero, reducer);
    } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) {
      // Can be viewed as a reduction of a matrix along 1st dimension.
      Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
                      constants.kZero, reducer);
    } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) {
      // Can be viewed as a reduction of a matrix along 2nd dimension.
      Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
                      constants.kOne, reducer);
    } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) {
      // Can be viewed as a reduction of a 3D tensor along 1st and 3rd
      // dimensions.
      Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data),
                      constants.kZeroTwo, reducer);
    } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) {
      // Can be viewed as a reduction of a 3D tensor along 2nd dimension.
      Functor::Reduce(ctx, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data),
                      constants.kOne, reducer);
    } else {
      // If we don't hit one of the cases above, transpose the data so that
      // all reduced dimensions are last and reuse the 2-D -> 1-D case.
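      //
      // For example (hypothetical shapes): a [2, 3, 4, 5] reshaped input
      // with dims {0, 2} reduced keeps dims {1, 3}, so permutation() is
      // {1, 3, 0, 2} and shuffled_shape() is [3, 5, 2, 4].  That buffer is
      // then viewed as a [15, 8] matrix and reduced along its 2nd dimension.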
      Tensor data_reshaped;
      CHECK(data_reshaped.CopyFrom(data, helper.data_reshape()));
      Tensor shuffled;
      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                             helper.shuffled_shape(), &shuffled,
                                             alloc_attr));
      OP_REQUIRES_OK(
          ctx, DoTranspose(d, data_reshaped, helper.permutation(), &shuffled));
      const int64 unreduced = tmp_out.NumElements();
      const int64 reduced = shuffled.NumElements() / unreduced;
      const Tensor& const_shuffled = shuffled;
      Functor::Reduce(ctx, tmp_out.flat<T>(),
                      const_shuffled.shaped<T, 2>({unreduced, reduced}),
                      constants.kOne, reducer);
    }

    // Set the real output using the contents of the reduction but the
    // real expected output shape.  The number of elements should
    // match between the two shapes.
    Tensor out;
    if (!out.CopyFrom(tmp_out, helper.out_shape())) {
      ctx->SetStatus(errors::Internal("Error during reduction copy."));
    }
    ctx->set_output(0, out);
  }

 private:
  // True if the number of dimensions should be maintained.
  bool keep_dims_;
};
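
// For reference, the per-op compilation units that include this header
// typically instantiate and register ReductionOp along these lines (a
// sketch; the exact op name, types, and reducer vary per file):
//
//   REGISTER_KERNEL_BUILDER(
//       Name("Sum")
//           .Device(DEVICE_CPU)
//           .TypeConstraint<float>("T")
//           .TypeConstraint<int32>("Tidx"),
//       ReductionOp<CPUDevice, float, int32,
//                   Eigen::internal::SumReducer<float>>);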

namespace functor {

template <typename Device, typename Reducer>
struct ReduceFunctorBase {
  template <typename OUT_T, typename IN_T, typename ReductionAxes>
  static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
                     const ReductionAxes& reduction_axes,
                     const Reducer& reducer) {
    const Device& d = ctx->eigen_device<Device>();
    ReduceEigenImpl(d, out, in, reduction_axes, reducer);
  }

  template <typename OUT_T>
  static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer) {
    FillIdentityEigenImpl(d, out, reducer);
  }
};

template <typename Reducer>
struct ReduceFunctor<CPUDevice, Reducer>
    : ReduceFunctorBase<CPUDevice, Reducer> {};
#ifdef TENSORFLOW_USE_SYCL
template <typename Reducer>
struct ReduceFunctor<SYCLDevice, Reducer>
    : ReduceFunctorBase<SYCLDevice, Reducer> {};
#endif  // TENSORFLOW_USE_SYCL
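
// Note: GPU specializations of ReduceFunctor are not defined here; they live
// in separate CUDA compilation units (the reduction_ops_gpu.* files in this
// directory), which is why only the CPU and SYCL variants appear above.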

}  // namespace functor
}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_