/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/transpose_op.h"

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

// inv = InvertPermutationOp(T<int32/int64> p) takes a permutation of
// integers 0, 1, ..., n - 1 and returns the inverted
// permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n).
//
// REQUIRES: input is a vector of int32 or int64.
// REQUIRES: input is a permutation of 0, 1, ..., n-1.
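//
// For example, p = [2, 0, 1] inverts to inv = [1, 2, 0]: p[0] == 2 gives
// inv[2] == 0, p[1] == 0 gives inv[0] == 1, and p[2] == 1 gives inv[1] == 2.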

template <typename T>
class InvertPermutationOp : public OpKernel {
 public:
  explicit InvertPermutationOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input.shape()),
        errors::InvalidArgument("invert_permutation expects a 1D vector."));
    auto Tin = input.vec<T>();
    OP_REQUIRES(context,
                FastBoundsCheck(Tin.size(), std::numeric_limits<int32>::max()),
                errors::InvalidArgument("permutation of nonnegative int32s "
                                        "must have <= int32 max elements"));
    const T N = static_cast<T>(Tin.size());  // Safe: bounds-checked above.
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));
    auto Tout = output->vec<T>();
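    // Initialize every slot to -1 so duplicates in the input are detectable:
    // a second write to the same slot finds a value that is no longer -1.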
    std::fill_n(Tout.data(), N, -1);
    for (int i = 0; i < N; ++i) {
      const T d = internal::SubtleMustCopy(Tin(i));
      OP_REQUIRES(context, FastBoundsCheck(d, N),
                  errors::InvalidArgument(d, " is not between 0 and ", N));
      OP_REQUIRES(context, Tout(d) == -1,
                  errors::InvalidArgument(d, " is duplicated in the input."));
      Tout(d) = i;
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
    InvertPermutationOp<int32>);
REGISTER_KERNEL_BUILDER(
    Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int64>("T"),
    InvertPermutationOp<int64>);

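// The GPU (and SYCL) registrations below keep both the input "x" and the
// output "y" in host memory, so the same sequential, data-dependent host-side
// loop is used even when the op is placed on an accelerator.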
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("x")
                            .HostMemory("y"),
                        InvertPermutationOp<int32>);
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int64>("T")
                            .HostMemory("x")
                            .HostMemory("y"),
                        InvertPermutationOp<int64>);

#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("x")
                            .HostMemory("y"),
                        InvertPermutationOp<int32>);
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int64>("T")
                            .HostMemory("x")
                            .HostMemory("y"),
                        InvertPermutationOp<int64>);
#endif  // TENSORFLOW_USE_SYCL

namespace {
template <typename Tperm>
Status PermutationHelper(const Tensor& perm, const int dims,
                         std::vector<int32>* permutation) {
  auto Vperm = perm.vec<Tperm>();
  if (dims != Vperm.size()) {
    return errors::InvalidArgument("transpose expects a vector of size ", dims,
                                   ". But input(1) is a vector of size ",
                                   Vperm.size());
  }
  // Use volatile loads instead of SubtleMustCopy so that each element is read
  // from the possibly-changing input buffer exactly once; the copy into
  // *permutation is the asynchrony boundary, and all later validation runs
  // against that stable snapshot.
  const volatile Tperm* perm_begin =
      reinterpret_cast<const volatile Tperm*>(Vperm.data());
  *permutation = std::vector<int32>(perm_begin, perm_begin + dims);

  return Status::OK();
}
}  // namespace

// output = TransposeOp(T<any> input, T<int32/int64> perm) takes a tensor
// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
// shuffles the dimensions of the input tensor according to the permutation.
//
// Specifically, the returned tensor output meets the following conditions:
// 1) output.dims() == input.dims();
// 2) output.dim_size(i) == input.dim_size(perm[i]);
// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
//      input.tensor<T, N>(j_0, j_1, ..., j_N-1),
//    where i_s == j_{perm[s]}
//
// REQUIRES: perm is a vector of int32 or int64.
// REQUIRES: input.dims() == perm.size().
// REQUIRES: perm is a permutation of 0, 1, ..., N-1.
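//
// For example, an input of shape [2, 3, 4] with perm = [2, 0, 1] yields an
// output of shape [4, 2, 3], where output(i_0, i_1, i_2) ==
// input(i_1, i_2, i_0).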

void TransposeOp::Compute(OpKernelContext* ctx) {
  const Tensor& input = ctx->input(0);
  const Tensor& perm = ctx->input(1);
  // Preliminary validation of sizes.
  OP_REQUIRES(ctx, TensorShapeUtils::IsVector(perm.shape()),
              errors::InvalidArgument("perm must be a vector, not ",
                                      perm.shape().DebugString()));

  // Although Tperm may be an int64 type, an int32 is sufficient to hold
  // dimension range values, so the narrowing here should be safe.
  std::vector<int32> permutation;
  const int dims = input.dims();
  if (perm.dtype() == DT_INT32) {
    OP_REQUIRES_OK(ctx, PermutationHelper<int32>(perm, dims, &permutation));
  } else {
    OP_REQUIRES_OK(ctx, PermutationHelper<int64>(perm, dims, &permutation));
  }
  TensorShape shape;

  // Check whether permutation is a permutation of the integers [0 .. dims).
  gtl::InlinedVector<bool, 8> bits(dims);
  bool is_identity = true;
  for (int i = 0; i < dims; ++i) {
    const int32 d = permutation[i];
    OP_REQUIRES(
        ctx, 0 <= d && d < dims,
        errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
    bits[d] = true;
    const auto dim_size = input.dim_size(d);
    shape.AddDim(dim_size);
    if (d != i) {
      is_identity = false;
    }
  }
  for (int i = 0; i < dims; ++i) {
    OP_REQUIRES(
        ctx, bits[i],
        errors::InvalidArgument(i, " is missing from {",
                                str_util::Join(permutation, ","), "}."));
  }

  // 0-D, 1-D, and identity transposes do nothing.
  if (!IsConjugate() && (dims <= 1 || is_identity)) {
    ctx->set_output(0, input);
    return;
  } else if (!IsConjugate() && internal::NonSingletonDimensionsAlign(
                                   input.shape(), permutation)) {
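    // Here the permutation keeps every dimension of size > 1 in its original
    // relative order, so the flat row-major data layout is unchanged and a
    // buffer-sharing reshape suffices. For example, shape [1, 5] with
    // perm = [1, 0] becomes shape [5, 1] over the same underlying buffer.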
    Tensor output;
    OP_REQUIRES(ctx, output.CopyFrom(input, shape),
                errors::Unknown("Error reshaping Tensor."));
    ctx->set_output(0, output);
    return;
  }

  Tensor* output = nullptr;
  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
  if (shape.num_elements() > 0) {
    OP_REQUIRES_OK(ctx, DoTranspose(ctx, input, permutation, output));
  }
}

Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                   gtl::ArraySlice<int32> perm, Tensor* out) {
  typedef Eigen::ThreadPoolDevice CPUDevice;
  return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
                                   out);
}

Status ConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
                                            const Tensor& in,
                                            gtl::ArraySlice<int32> perm,
                                            Tensor* out) {
  typedef Eigen::ThreadPoolDevice CPUDevice;
  return ::tensorflow::DoConjugateTranspose(ctx->eigen_device<CPUDevice>(), in,
                                            perm, out);
}

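// Kernel registrations: one Transpose and one ConjugateTranspose kernel per
// element type. "perm" is pinned to host memory in every case, since
// Compute() must read its values to validate them and build the output shape.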
#ifdef INTEL_MKL
#define REGISTER(T)                                   \
  REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                              .Device(DEVICE_CPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          MklTransposeCpuOp);         \
  REGISTER_KERNEL_BUILDER(Name("ConjugateTranspose")  \
                              .Device(DEVICE_CPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          MklConjugateTransposeCpuOp);
TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER

#else  // INTEL_MKL

#define REGISTER(T)                                   \
  REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                              .Device(DEVICE_CPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          TransposeCpuOp);            \
  REGISTER_KERNEL_BUILDER(Name("ConjugateTranspose")  \
                              .Device(DEVICE_CPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          ConjugateTransposeCpuOp);
TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER
#endif  // INTEL_MKL

#if GOOGLE_CUDA
Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                   gtl::ArraySlice<int32> perm, Tensor* out) {
  typedef Eigen::GpuDevice GPUDevice;
  return ::tensorflow::DoTranspose(ctx->eigen_device<GPUDevice>(), in, perm,
                                   out);
}
Status ConjugateTransposeGpuOp::DoTranspose(OpKernelContext* ctx,
                                            const Tensor& in,
                                            gtl::ArraySlice<int32> perm,
                                            Tensor* out) {
  typedef Eigen::GpuDevice GPUDevice;
  return ::tensorflow::DoConjugateTranspose(ctx->eigen_device<GPUDevice>(), in,
                                            perm, out);
}

#define REGISTER(T)                                   \
  REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                              .Device(DEVICE_GPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          TransposeGpuOp);            \
  REGISTER_KERNEL_BUILDER(Name("ConjugateTranspose")  \
                              .Device(DEVICE_GPU)     \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          ConjugateTransposeGpuOp);
TF_CALL_POD_TYPES(REGISTER);
#undef REGISTER
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                    gtl::ArraySlice<int32> perm, Tensor* out) {
  typedef Eigen::SyclDevice SYCLDevice;
  return ::tensorflow::DoTranspose(ctx->eigen_device<SYCLDevice>(), in, perm,
                                   out);
}
Status ConjugateTransposeSyclOp::DoTranspose(OpKernelContext* ctx,
                                             const Tensor& in,
                                             gtl::ArraySlice<int32> perm,
                                             Tensor* out) {
  typedef Eigen::SyclDevice SYCLDevice;
  return ::tensorflow::DoConjugateTranspose(ctx->eigen_device<SYCLDevice>(), in,
                                            perm, out);
}
#define REGISTER(T)                                   \
  REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                              .Device(DEVICE_SYCL)    \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          TransposeSyclOp);           \
  REGISTER_KERNEL_BUILDER(Name("ConjugateTranspose")  \
                              .Device(DEVICE_SYCL)    \
                              .TypeConstraint<T>("T") \
                              .HostMemory("perm"),    \
                          ConjugateTransposeSyclOp);
TF_CALL_POD_TYPES(REGISTER);
#undef REGISTER
#endif  // TENSORFLOW_USE_SYCL
}  // namespace tensorflow