/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_index.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

// Forward declarations of functors that will be defined in tile_ops_impl.h
namespace functor {
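// Tile<Device, T, Tmultiple> writes to 'out' the contents of 'in' replicated
// broadcast_array[i] times along dimension i ('broadcast_array' carries the
// per-dimension multiples).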
template <typename Device, typename T, typename Tmultiple>
struct Tile {
  void operator()(const Device& d, Tensor* out, const Tensor& in,
                  const gtl::ArraySlice<Tmultiple> broadcast_array) const;
};

template <typename Device, typename T, int NDIM>
struct TileGrad {
  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
                  typename TTypes<T, NDIM>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes,
                  bool first) const;
};

template <typename Device, typename T>
struct TileGrad<Device, T, 0> {
  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
                  typename TTypes<T, 0>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&, bool first) const;
};

template <typename Device, typename T, int NDIM, int REDUCEDNDIM>
struct ReduceAndReshape {
  void operator()(
      const Device& d, typename TTypes<T, NDIM>::Tensor out,
      typename TTypes<T, NDIM>::ConstTensor in,
      const Eigen::DSizes<Eigen::DenseIndex, REDUCEDNDIM>& reduce_dim,
      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& reshape_dim) const;
};
}  // namespace functor

// --------------------------------------------------------------------------
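// TileOp implements the "Tile" op: it replicates the input multiples[i] times
// along dimension i. Type dispatch is done per dtype via HandleCase<DT>, which
// forwards to the functor::Tile implementation for the device.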
template <typename Device, typename Tmultiples>
class TileOp : public OpKernel {
 public:
  explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);

    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));
    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] >= 0,
          errors::InvalidArgument("Expected multiples[", i, "] >= 0, but got ",
                                  multiples_array[i]));
      output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
    }
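    // output_shape is the element-wise product of the input shape and the
    // multiples, e.g. an input of shape [2, 3] with multiples [2, 1] yields an
    // output of shape [4, 3].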
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

    // If there's no output, there's nothing to do.
    if (output_shape.num_elements() == 0) return;

#define HANDLE_TYPE(DT)                               \
  if (context->input(0).dtype() == DT) {              \
    HandleCase<DT>(context, multiples_array, result); \
    return;                                           \
  }

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    // Invoke macro using TF_CALL_* so type-filtering for platform applies.
    TF_CALL_bool(HANDLE_TYPE_NAME);
    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_uint8(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_string(HANDLE_TYPE_NAME);  // when DEVICE=CPUDevice.
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE

    OP_REQUIRES(context, false,
                errors::Unimplemented(
                    "TileOp : Unhandled input dimensions, DT : ",
                    context->input(0).dtype(), ", dims : ", input_dims));
  }

 private:
  template <DataType DT>
  void HandleCaseImpl(OpKernelContext* context,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;
    functor::Tile<Device, T, Tmultiples>()(context->eigen_device<Device>(),
                                           result, context->input(0),
                                           multiples_array);
  }

  template <DataType DT>
  void HandleCase(OpKernelContext* context,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT>
inline void TileOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  // TODO(vrv): print out the device name if useful. Currently disabled to avoid
  // having to use RTTI.
  LOG(FATAL) << "TileOp: Invalid combination of Device, DT: "
             // << typeid(Device).name() << ", "
             << DataTypeString(DT);
}

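// The primary HandleCase template above aborts; the HANDLE_CASE macro below
// generates an explicit specialization for each supported combination of
// device, dtype and Tmultiples, so only those combinations dispatch to
// HandleCaseImpl.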
#define HANDLE_CASE(device, dtype, Tmultiples)                              \
  template <>                                                               \
  template <>                                                               \
  void TileOp<device, Tmultiples>::HandleCase<dtype>(                       \
      OpKernelContext * context,                                            \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) { \
    HandleCaseImpl<dtype>(context, multiples_array, result);                \
  }

#define HANDLE_TYPE_NAME_CPU(T)                            \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int64);

#define HANDLE_TYPE_NAME_GPU(T)                            \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int64);

#ifdef TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T)                            \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int64);
#endif  // TENSORFLOW_USE_SYCL

TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#ifdef TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL
#undef HANDLE_CASE

// --------------------------------------------------------------------------
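// TileGradientOp implements "TileGrad": given the gradient with respect to the
// tiled output and the original multiples, it produces the gradient with
// respect to the Tile input by summing the tiles back into a tensor of the
// original (un-tiled) shape. For example, a gradient of shape [4, 3] produced
// with multiples [2, 1] reduces to shape [2, 3].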
template <typename Device, typename Tmultiples>
class TileGradientOp : public OpKernel {
 public:
  explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);
    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));

    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    std::vector<Tmultiples> input_dim_size_vec;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] > 0,
          errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
                                  multiples_array[i]));
      OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
                  errors::InvalidArgument("Expected input_dim[", i,
                                          "] to be divisible by multiples[", i,
                                          "], but ", input.dim_size(i), " % ",
                                          multiples_array[i], " != 0"));
      output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
      input_dim_size_vec.push_back(input.dim_size(i));
    }
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

#define HANDLE_DIM(DT, NDIM)                                           \
  if (context->input(0).dtype() == DT && input_dims == NDIM) {         \
    HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
                         result);                                      \
    return;                                                            \
  }

#define HANDLE_TYPE(T) \
  HANDLE_DIM(T, 1)     \
  HANDLE_DIM(T, 2)     \
  HANDLE_DIM(T, 3)     \
  HANDLE_DIM(T, 4)     \
  HANDLE_DIM(T, 5)     \
  HANDLE_DIM(T, 6)     \
  HANDLE_DIM(T, 7)

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM

    OP_REQUIRES(context, false,
                errors::Unimplemented(
                    "TileGradientOp : Unhandled input dimensions, DT : ",
                    context->input(0).dtype(), ", dims : ", input_dims));
  }

 private:
  template <DataType DT, int NDIM>
  void HandleCase(OpKernelContext* context,
                  const std::vector<Tmultiples>& input_dims,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  template <DataType DT, int NDIM>
  void HandleCaseImpl(OpKernelContext* context,
                      const std::vector<Tmultiples>& input_dims,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;

    bool reduction_only = true;
    std::vector<Tmultiples> reduction_dims;

    for (int i = 0; i < NDIM; ++i) {
      if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
        reduction_only = false;
        break;
      } else {
        if (multiples_array[i] == input_dims[i]) {
          reduction_dims.push_back(i);
        }
      }
    }

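    // A dimension can be handled as a pure reduction when its un-tiled size
    // was 1 (i.e. multiples[i] == input_dims[i]): its gradient is simply the
    // sum over that dimension. If every tiled dimension is of this form, the
    // whole gradient can be computed with a single reduce-and-reshape;
    // otherwise fall through to the slice-accumulation loop below.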
    if (reduction_only) {
#define HANDLE_DIM(D)                                            \
  if (reduction_dims.size() == (D)) {                            \
    HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
    return;                                                      \
  }
      // NOTE(keveman): Handling the most common case here.
      // Adding more cases here would require more templating and code
      // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
      HANDLE_DIM(1);

// Fall through to the unoptimized version.
#undef HANDLE_DIM
    }

    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;

    // Accumulate slices along the dimensions into the output. The number of
    // slices along dimension 'i' is simply the multiple along dimension 'i'
    // passed to the original Tile op.
    for (int i = 0; i < NDIM; ++i) {
      sizes[i] = input_dims[i] / multiples_array[i];
      indices[i] = 0;
    }

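    // Visit every tile odometer-style: 'indices' marks the origin of the
    // current slice in the input gradient and advances by sizes[i] along
    // dimension i. For multiples [2, 3] the origins visited are (0, 0),
    // (s0, 0), (0, s1), (s0, s1), (0, 2*s1), (s0, 2*s1), where si = sizes[i].
    // The 'first' flag lets the functor initialize the output with the first
    // slice and accumulate the remaining slices into it.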
    bool first = true;
    while (true) {
      functor::TileGrad<Device, T, NDIM>()(
          context->eigen_device<Device>(), result->tensor<T, NDIM>(),
          context->input(0).tensor<T, NDIM>(), indices, sizes, first);
      first = false;
      // Increment the begin indices.
      int i = 0;
      while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
        indices[i] = 0;
        ++i;
      }
      // We are finished if we have iterated to the maximum along all
      // dimensions.
      if (i == NDIM) {
        break;
      }
      indices[i] += sizes[i];
    }
  }

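  // Fast path for the reduction-only case above: sums the input gradient over
  // 'reduce_dim_in' and reshapes the result back to the output shape in a
  // single functor::ReduceAndReshape call.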
  template <typename T, int NDIM, int REDUCENDIM>
  void HandleReduce(OpKernelContext* context,
                    const std::vector<Tmultiples>& reduce_dim_in,
                    Tensor* result) {
    static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
    Eigen::DSizes<Eigen::DenseIndex, REDUCENDIM> reduce_dim;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> reshape_dim;

    for (int i = 0; i < REDUCENDIM; ++i) {
      reduce_dim[i] = reduce_dim_in[i];
    }

    for (int i = 0; i < NDIM; ++i) {
      reshape_dim[i] = result->dim_size(i);
    }

    functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
        context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
  }

  TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT, int NDIM>
inline void TileGradientOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context, const std::vector<Tmultiples>& input_dims,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
             << MakeTypeIndex<Device>().name() << ", " << DataTypeString(DT)
             << ", " << NDIM;
}

#define HANDLE_CASE(device, T, dtype, Tmultiples, ndim)                        \
  template <>                                                                  \
  template <>                                                                  \
  void TileGradientOp<device, Tmultiples>::HandleCase<dtype, ndim>(            \
      OpKernelContext * context, const std::vector<Tmultiples>& input_dims,    \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {    \
    HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
  }

// 0-D handled specially above
#define HANDLE_CASE_DIM(device, T, dtype)  \
  HANDLE_CASE(device, T, dtype, int32, 1); \
  HANDLE_CASE(device, T, dtype, int32, 2); \
  HANDLE_CASE(device, T, dtype, int32, 3); \
  HANDLE_CASE(device, T, dtype, int32, 4); \
  HANDLE_CASE(device, T, dtype, int32, 5); \
  HANDLE_CASE(device, T, dtype, int32, 6); \
  HANDLE_CASE(device, T, dtype, int32, 7); \
  HANDLE_CASE(device, T, dtype, int64, 1); \
  HANDLE_CASE(device, T, dtype, int64, 2); \
  HANDLE_CASE(device, T, dtype, int64, 3); \
  HANDLE_CASE(device, T, dtype, int64, 4); \
  HANDLE_CASE(device, T, dtype, int64, 5); \
  HANDLE_CASE(device, T, dtype, int64, 6); \
  HANDLE_CASE(device, T, dtype, int64, 7);

#define HANDLE_TYPE_NAME_CPU(T) \
  HANDLE_CASE_DIM(CPUDevice, T, DataTypeToEnum<T>::value);

#define HANDLE_TYPE_NAME_GPU(T) \
  HANDLE_CASE_DIM(GPUDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#if TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T) \
  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE

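// Kernel registrations. The 'multiples' input is pinned to host memory on
// every device because its values are read on the host when computing the
// output shape.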
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileOp<CPUDevice, int64>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileGradientOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileGradientOp<CPUDevice, int64>);

#if GOOGLE_CUDA
#define REGISTER_GPU_TILE(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int32>);               \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int64>);

#define REGISTER_GPU_TILE_GRAD(type)                               \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int32>);       \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int64>);

#define REGISTER_GPU(type) \
  REGISTER_GPU_TILE(type); \
  REGISTER_GPU_TILE_GRAD(type);

TF_CALL_bool(REGISTER_GPU_TILE);
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
TF_CALL_half(REGISTER_GPU);
TF_CALL_int16(REGISTER_GPU);
TF_CALL_int32(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU)

#undef REGISTER_GPU_TILE
#undef REGISTER_GPU_TILE_GRAD
#undef REGISTER_GPU
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type)                                        \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int32>);              \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int64>);              \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int32>);      \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int64>);

TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL);

#undef REGISTER_SYCL
#endif  // TENSORFLOW_USE_SYCL

}  // namespace tensorflow