      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
     12 
     13 namespace Eigen {
     14 
     15 /** \class TensorConvolution
     16   * \ingroup CXX11_Tensor_Module
     17   *
     18   * \brief Tensor convolution class.
     19   *
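          * Computes the convolution of an input tensor with a kernel along the
          * subset of its dimensions listed in \c Indices. Each convolved
          * dimension shrinks to input_dim - kernel_dim + 1 (a "valid"
          * convolution, i.e. no padding).
          *
          * Illustrative usage (a minimal sketch; the tensor ranks and sizes
          * below are arbitrary):
          * \code
          * Eigen::Tensor<float, 3> input(20, 30, 40);
          * Eigen::Tensor<float, 2> kernel(3, 5);
          * input.setRandom();
          * kernel.setRandom();
          * Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve along dimensions 1 and 2
          * Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
          * // output dimensions: 20 x (30 - 3 + 1) x (40 - 5 + 1) = 20 x 28 x 36
          * \endcode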
     20   *
     21   */
     22 namespace internal {
     23 
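        // IndexMapper precomputes the strides needed by the GPU convolution
        // kernels. It reorders the tensor dimensions so that the convolved
        // (kernel) dimensions come first for ColMajor layouts and last for
        // RowMajor layouts, and converts between a "plane" index over the
        // remaining dimensions plus intra-kernel coordinates on one side, and
        // linear offsets into the input and output tensors on the other.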
     24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
     25 class IndexMapper {
     26  public:
     27   IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
     28               const array<Index, NumKernelDims>& indices) {
     29 
     30     array<Index, NumDims> dimensions = input_dims;
     31     for (int i = 0; i < NumKernelDims; ++i) {
     32       const Index index = indices[i];
     33       const Index input_dim = input_dims[index];
     34       const Index kernel_dim = kernel_dims[i];
     35       const Index result_dim = input_dim - kernel_dim + 1;
     36       dimensions[index] = result_dim;
     37     }
     38 
     39     array<Index, NumDims> inputStrides;
     40     array<Index, NumDims> outputStrides;
     41     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
     42       inputStrides[0] = 1;
     43       outputStrides[0] = 1;
     44       for (int i = 1; i < NumDims; ++i) {
     45         inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
     46         outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
     47       }
     48     } else {
     49       inputStrides[NumDims - 1] = 1;
     50       outputStrides[NumDims - 1] = 1;
     51       for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
     52         inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
     53         outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
     54       }
     55     }
     56 
     57     array<Index, NumDims> cudaInputDimensions;
     58     array<Index, NumDims> cudaOutputDimensions;
     59     array<Index, NumDims> tmp = dimensions;
     60     array<Index, NumDims> ordering;
     61     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
     62                               ? 0
     63                               : NumDims - NumKernelDims;
     64     for (int i = 0; i < NumKernelDims; ++i) {
     65       const Index index = i + offset;
     66       ordering[index] = indices[i];
     67       tmp[indices[i]] = -1;
     68       cudaInputDimensions[index] = input_dims[indices[i]];
     69       cudaOutputDimensions[index] = dimensions[indices[i]];
     70     }
     71 
     72     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
     73                       ? NumKernelDims
     74                       : 0;
     75     for (int i = 0; i < NumDims; ++i) {
     76       if (tmp[i] >= 0) {
     77         ordering[written] = i;
     78         cudaInputDimensions[written] = input_dims[i];
     79         cudaOutputDimensions[written] = dimensions[i];
     80         ++written;
     81       }
     82     }
     83 
     84     for (int i = 0; i < NumDims; ++i) {
     85       m_inputStrides[i] = inputStrides[ordering[i]];
     86       m_outputStrides[i] = outputStrides[ordering[i]];
     87     }
     88 
     89     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
     90       for (int i = 0; i < NumDims; ++i) {
     91         if (i > NumKernelDims) {
     92           m_cudaInputStrides[i] =
     93               m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
     94           m_cudaOutputStrides[i] =
     95               m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
     96         } else {
     97           m_cudaInputStrides[i] = 1;
     98           m_cudaOutputStrides[i] = 1;
     99         }
    100       }
    101     } else {
    102       for (int i = NumDims - 1; i >= 0; --i) {
    103         if (i + 1 < offset) {
    104           m_cudaInputStrides[i] =
    105               m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
    106           m_cudaOutputStrides[i] =
    107               m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
    108         } else {
    109           m_cudaInputStrides[i] = 1;
    110           m_cudaOutputStrides[i] = 1;
    111         }
    112       }
    113     }
    114   }
    115 
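          // Maps a "plane" index p (a linear index over the non-convolved
          // dimensions) to the offset of that plane's first element in the
          // input tensor.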
    116   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    117     Index inputIndex = 0;
    118     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    119       for (int d = NumDims - 1; d > NumKernelDims; --d) {
    120         const Index idx = p / m_cudaInputStrides[d];
    121         inputIndex += idx * m_inputStrides[d];
    122         p -= idx * m_cudaInputStrides[d];
    123       }
    124       inputIndex += p * m_inputStrides[NumKernelDims];
    125     } else {
    126       std::ptrdiff_t limit = 0;
    127       if (NumKernelDims < NumDims) {
    128         limit = NumDims - NumKernelDims - 1;
    129       }
    130       for (int d = 0; d < limit; ++d) {
    131         const Index idx = p / m_cudaInputStrides[d];
    132         inputIndex += idx * m_inputStrides[d];
    133         p -= idx * m_cudaInputStrides[d];
    134       }
    135       inputIndex += p * m_inputStrides[limit];
    136     }
    137     return inputIndex;
    138   }
    139 
    140   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    141     Index outputIndex = 0;
    142     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    143       for (int d = NumDims - 1; d > NumKernelDims; --d) {
    144         const Index idx = p / m_cudaOutputStrides[d];
    145         outputIndex += idx * m_outputStrides[d];
    146         p -= idx * m_cudaOutputStrides[d];
    147       }
    148       outputIndex += p * m_outputStrides[NumKernelDims];
    149     } else {
    150       std::ptrdiff_t limit = 0;
    151       if (NumKernelDims < NumDims) {
    152         limit = NumDims - NumKernelDims - 1;
    153       }
    154       for (int d = 0; d < limit; ++d) {
    155         const Index idx = p / m_cudaOutputStrides[d];
    156         outputIndex += idx * m_outputStrides[d];
    157         p -= idx * m_cudaOutputStrides[d];
    158       }
    159       outputIndex += p * m_outputStrides[limit];
    160     }
    161     return outputIndex;
    162   }
    163 
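          // The overloads below map coordinates within the convolved
          // dimensions (1, 2 or 3 of them) to offsets in the input and output
          // tensors; they are added to the plane offsets computed above.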
    164   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    165     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    166                               ? 0
    167                               : NumDims - NumKernelDims;
    168     return i * m_inputStrides[offset];
    169   }
    170 
    171   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    172     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    173                               ? 0
    174                               : NumDims - NumKernelDims;
    175     return i * m_outputStrides[offset];
    176   }
    177 
    178   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    179     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    180                               ? 0
    181                               : NumDims - NumKernelDims;
    182     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
    183   }
    184 
    185   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    186     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    187                               ? 0
    188                               : NumDims - NumKernelDims;
    189     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
    190   }
    191 
    192   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    193     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    194                               ? 0
    195                               : NumDims - NumKernelDims;
    196     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
    197            k * m_inputStrides[offset + 2];
    198   }
    199 
    200   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    201     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    202                               ? 0
    203                               : NumDims - NumKernelDims;
    204     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
    205            k * m_outputStrides[offset + 2];
    206   }
    207 
    208  private:
    209   static const int NumDims = internal::array_size<InputDims>::value;
    210   array<Index, NumDims> m_inputStrides;
    211   array<Index, NumDims> m_outputStrides;
    212   array<Index, NumDims> m_cudaInputStrides;
    213   array<Index, NumDims> m_cudaOutputStrides;
    214 };
    215 
    216 
    217 
    218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
    220 {
    221   // Type promotion to handle the case where the types of the lhs and the rhs are different.
    222   typedef typename promote_storage_type<typename InputXprType::Scalar,
    223                                         typename KernelXprType::Scalar>::ret Scalar;
    224   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
    225                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;
    226   typedef typename promote_index_type<typename traits<InputXprType>::Index,
    227                                       typename traits<KernelXprType>::Index>::type Index;
    228   typedef typename InputXprType::Nested LhsNested;
    229   typedef typename KernelXprType::Nested RhsNested;
    230   typedef typename remove_reference<LhsNested>::type _LhsNested;
    231   typedef typename remove_reference<RhsNested>::type _RhsNested;
    232   static const int NumDimensions = traits<InputXprType>::NumDimensions;
    233   static const int Layout = traits<InputXprType>::Layout;
    234 
    235   enum {
    236     Flags = 0
    237   };
    238 };
    239 
    240 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    241 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
    242 {
    243   typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
    244 };
    245 
    246 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    247 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
    248 {
    249   typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
    250 };
    251 
    252 }  // end namespace internal
    253 
    254 
    255 
    256 template<typename Indices, typename InputXprType, typename KernelXprType>
    257 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
    258 {
    259   public:
    260   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    261   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    262   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
    263                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    264   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    265   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    266   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
    267 
    268   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
    269       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
    270 
    271     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    272     const Indices& indices() const { return m_indices; }
    273 
    274     /** \returns the nested expressions */
    275     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    276     const typename internal::remove_all<typename InputXprType::Nested>::type&
    277     inputExpression() const { return m_input_xpr; }
    278 
    279     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    280     const typename internal::remove_all<typename KernelXprType::Nested>::type&
    281     kernelExpression() const { return m_kernel_xpr; }
    282 
    283   protected:
    284     typename InputXprType::Nested m_input_xpr;
    285     typename KernelXprType::Nested m_kernel_xpr;
    286     const Indices m_indices;
    287 };
    288 
    289 
    290 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
    291 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
    292 {
    293   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
    294 
    295   static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
    296   static const int NumKernelDims = internal::array_size<Indices>::value;
    297   typedef typename XprType::Index Index;
    298   typedef DSizes<Index, NumDims> Dimensions;
    299 
    300   typedef typename XprType::Scalar Scalar;
    301   typedef typename XprType::CoeffReturnType CoeffReturnType;
    302   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
    303   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    304 
    305   enum {
    306     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    307     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
    308     Layout = TensorEvaluator<InputArgType, Device>::Layout,
    309     CoordAccess = false,  // to be implemented
    310     RawAccess = false
    311   };
    312 
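          // Precomputes the input/output/kernel strides and the result
          // dimensions (input_dim - kernel_dim + 1 along each convolved
          // dimension).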
    313   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    314       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
    315   {
    316     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
    317 
    318     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    319     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
    320 
    321     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    322       m_inputStride[0] = 1;
    323       for (int i = 1; i < NumDims; ++i) {
    324         m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
    325       }
    326     } else {
    327       m_inputStride[NumDims - 1] = 1;
    328       for (int i = NumDims - 2; i >= 0; --i) {
    329         m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
    330       }
    331     }
    332 
    333     m_dimensions = m_inputImpl.dimensions();
    334     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    335       for (int i = 0; i < NumKernelDims; ++i) {
    336         const Index index = op.indices()[i];
    337         const Index input_dim = input_dims[index];
    338         const Index kernel_dim = kernel_dims[i];
    339         const Index result_dim = input_dim - kernel_dim + 1;
    340         m_dimensions[index] = result_dim;
    341         if (i > 0) {
    342           m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
    343         } else {
    344           m_kernelStride[0] = 1;
    345         }
    346         m_indexStride[i] = m_inputStride[index];
    347       }
    348 
    349       m_outputStride[0] = 1;
    350       for (int i = 1; i < NumDims; ++i) {
    351         m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
    352       }
    353     } else {
    354       for (int i = NumKernelDims - 1; i >= 0; --i) {
    355         const Index index = op.indices()[i];
    356         const Index input_dim = input_dims[index];
    357         const Index kernel_dim = kernel_dims[i];
    358         const Index result_dim = input_dim - kernel_dim + 1;
    359         m_dimensions[index] = result_dim;
    360         if (i < NumKernelDims - 1) {
    361           m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
    362         } else {
    363           m_kernelStride[NumKernelDims - 1] = 1;
    364         }
    365         m_indexStride[i] = m_inputStride[index];
    366       }
    367 
    368       m_outputStride[NumDims - 1] = 1;
    369       for (int i = NumDims - 2; i >= 0; --i) {
    370         m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
    371       }
    372     }
    373   }
    374 
    375   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
    376 
    377   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    378     m_inputImpl.evalSubExprsIfNeeded(NULL);
    379     preloadKernel();
    380     return true;
    381   }
    382   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    383     m_inputImpl.cleanup();
    384     if (m_local_kernel) {
    385       m_device.deallocate((void*)m_kernel);
    386       m_local_kernel = false;
    387     }
    388     m_kernel = NULL;
    389   }
    390 
    391   void evalTo(typename XprType::Scalar* buffer) {
    392     evalSubExprsIfNeeded(NULL);
    393     for (int i = 0; i < dimensions().TotalSize(); ++i) {
    394       buffer[i] += coeff(i);
    395     }
    396     cleanup();
    397   }
    398 
    399   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    400   {
    401     CoeffReturnType result = CoeffReturnType(0);
    402     convolve(firstInput(index), 0, NumKernelDims-1, result);
    403     return result;
    404   }
    405 
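          // Vectorized evaluation: when the PacketSize consecutive output
          // coefficients map to consecutive input locations, a single
          // packet-wide convolution is used; otherwise each coefficient is
          // convolved separately and the results are packed.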
    406   template<int LoadMode>
    407   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
    408   {
    409     Index indices[2] = {index, index+PacketSize-1};
    410     Index startInputs[2] = {0, 0};
    411     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    412       for (int i = NumDims - 1; i > 0; --i) {
    413         const Index idx0 = indices[0] / m_outputStride[i];
    414         const Index idx1 = indices[1] / m_outputStride[i];
    415         startInputs[0] += idx0 * m_inputStride[i];
    416         startInputs[1] += idx1 * m_inputStride[i];
    417         indices[0] -= idx0 * m_outputStride[i];
    418         indices[1] -= idx1 * m_outputStride[i];
    419       }
    420     } else {
    421       for (int i = 0; i < NumDims - 1; ++i) {
    422         const Index idx0 = indices[0] / m_outputStride[i];
    423         const Index idx1 = indices[1] / m_outputStride[i];
    424         startInputs[0] += idx0 * m_inputStride[i];
    425         startInputs[1] += idx1 * m_inputStride[i];
    426         indices[0] -= idx0 * m_outputStride[i];
    427         indices[1] -= idx1 * m_outputStride[i];
    428       }
    429     }
    430     startInputs[0] += indices[0];
    431     startInputs[1] += indices[1];
    432 
    433     if (startInputs[1]-startInputs[0] == PacketSize-1) {
    434       PacketReturnType result = internal::pset1<PacketReturnType>(0);
    435       convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
    436       return result;
    437     } else {
    438       EIGEN_ALIGN_MAX Scalar data[PacketSize];
    439       data[0] = Scalar(0);
    440       convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
    441       for (int i = 1; i < PacketSize-1; ++i) {
    442         data[i] = Scalar(0);
    443         convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
    444       }
    445       data[PacketSize-1] = Scalar(0);
    446       convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
    447       return internal::pload<PacketReturnType>(data);
    448     }
    449   }
    450 
    451   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
    452   costPerCoeff(bool vectorized) const {
    453     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    454     // We ignore the use of fused multiply-add.
    455     const double convolve_compute_cost =
    456         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    457     const double firstIndex_compute_cost =
    458         NumDims *
    459         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
    460          TensorOpCost::DivCost<Index>());
    461     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
    462            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
    463                           m_kernelImpl.costPerCoeff(vectorized) +
    464                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
    465                                        PacketSize));
    466   }
    467 
    468   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
    469 
    470  private:
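          // Maps an output coefficient index to the index of the first input
          // coefficient that contributes to it, by decomposing the output
          // index along the output strides and re-composing it with the input
          // strides.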
    471   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    472     Index startInput = 0;
    473     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    474       for (int i = NumDims - 1; i > 0; --i) {
    475         const Index idx = index / m_outputStride[i];
    476         startInput += idx * m_inputStride[i];
    477         index -= idx * m_outputStride[i];
    478       }
    479     } else {
    480       for (int i = 0; i < NumDims - 1; ++i) {
    481         const Index idx = index / m_outputStride[i];
    482         startInput += idx * m_inputStride[i];
    483         index -= idx * m_outputStride[i];
    484       }
    485     }
    486     startInput += index;
    487     return startInput;
    488   }
    489 
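          // Recursively iterates over the kernel dimensions, from DimIndex
          // down to 0, and accumulates input * kernel products into accum at
          // the innermost level.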
    490   EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    491     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
    492       const Index input = firstIndex + j * m_indexStride[DimIndex];
    493       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
    494       if (DimIndex > 0) {
    495         convolve(input, kernel, DimIndex-1, accum);
    496       } else {
    497         accum += m_inputImpl.coeff(input) * m_kernel[kernel];
    498       }
    499     }
    500   }
    501 
    502   template <typename Packet>
    503   EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    504     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
    505       const Index input = firstIndex + j * m_indexStride[DimIndex];
    506       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
    507       if (DimIndex > 0) {
    508         convolvePacket(input, kernel, DimIndex-1, accum);
    509       } else {
    510         accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
    511       }
    512     }
    513   }
    514 
    515   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    516     // Don't make a local copy of the kernel unless we have to (i.e. it's an
    517     // expression that needs to be evaluated)
    518     const Scalar* in_place = m_kernelImpl.data();
    519     if (in_place) {
    520       m_kernel = in_place;
    521       m_local_kernel = false;
    522     } else {
    523       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
    524       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
    525       typedef TensorEvalToOp<const KernelArgType> EvalTo;
    526       EvalTo evalToTmp(local, m_kernelArg);
    527       const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
    528       internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
    529 
    530       m_kernel = local;
    531       m_local_kernel = true;
    532     }
    533   }
    534 
    535   array<Index, NumDims> m_inputStride;
    536   array<Index, NumDims> m_outputStride;
    537 
    538   array<Index, NumKernelDims> m_indexStride;
    539   array<Index, NumKernelDims> m_kernelStride;
    540   TensorEvaluator<InputArgType, Device> m_inputImpl;
    541   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
    542   Dimensions m_dimensions;
    543 
    544   KernelArgType m_kernelArg;
    545   const Scalar* m_kernel;
    546   bool m_local_kernel;
    547   const Device& m_device;
    548 };
    549 
    550 
    551 
    552 
    553 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
    554 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    555 
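        // Returns the kernel size: the compile-time constant when it is known
        // (which lets the compiler fully unroll the convolution loops), or the
        // runtime value when the size is Dynamic.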
    556 template <int StaticKernelSize>
    557 struct GetKernelSize {
    558   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    559     return StaticKernelSize;
    560   }
    561 };
    562 template <>
    563 struct GetKernelSize<Dynamic> {
    564   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    565     return kernelSize;
    566   }
    567 };
    568 
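        // 1D GPU convolution: each block loads a tile of the input (the output
        // range it covers plus the kernel footprint) into shared memory, then
        // the threads of the block compute the output coefficients of the tile.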
    569 template <typename InputEvaluator, typename Index, typename InputDims,
    570           int StaticKernelSize>
    571 __global__ void EigenConvolutionKernel1D(
    572     InputEvaluator eval,
    573     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
    574         indexMapper,
    575     const float* __restrict kernel, const int numPlanes, const int numX,
    576     const int maxX, const int kernelSize, float* buffer) {
    577   extern __shared__ float s[];
    578 
    579   const int first_x = blockIdx.x * maxX;
    580   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    581   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
    582   const int num_x_output = last_x - first_x + 1;
    583 
    584   const int first_plane = blockIdx.y * blockDim.y;
    585   const int plane_stride = blockDim.y * gridDim.y;
    586 
    587   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    588     // Load inputs to shared memory
    589     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    590     const int plane_kernel_offset = threadIdx.y * num_x_input;
    591     #pragma unroll
    592     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    593       const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
    594       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    595     }
    596 
    597     __syncthreads();
    598 
    599     // Compute the convolution
    600     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    601 
    602     #pragma unroll
    603     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    604       const int kernel_offset = plane_kernel_offset + i;
    605       float result = 0.0f;
    606       #pragma unroll
    607       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
    608         result += s[k + kernel_offset] * kernel[k];
    609       }
    610       const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
    611       buffer[tensor_index] = result;
    612     }
    613     __syncthreads();
    614   }
     615 }
    616 
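        // 2D GPU convolution: same shared-memory tiling as the 1D kernel, but
        // with a 2D tile per plane and a doubly nested accumulation loop over
        // the kernel footprint.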
    617 template <typename InputEvaluator, typename Index, typename InputDims,
    618           int StaticKernelSizeX, int StaticKernelSizeY>
    619 __global__ void EigenConvolutionKernel2D(
    620     InputEvaluator eval,
    621     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
    622         indexMapper,
    623     const float* __restrict kernel, const int numPlanes, const int numX,
    624     const int maxX, const int numY, const int maxY, const int kernelSizeX,
    625     const int kernelSizeY, float* buffer) {
    626   extern __shared__ float s[];
    627 
    628   const int first_x = blockIdx.x * maxX;
    629   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    630   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
    631   const int num_x_output = last_x - first_x + 1;
    632 
    633   const int first_y = blockIdx.y * maxY;
    634   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
    635   const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
    636   const int num_y_output = last_y - first_y + 1;
    637 
    638   const int first_plane = blockIdx.z * blockDim.z;
    639   const int plane_stride = blockDim.z * gridDim.z;
    640 
    641   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    642 
    643     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    644     const int plane_kernel_offset = threadIdx.z * num_y_input;
    645 
    646     // Load inputs to shared memory
    647     #pragma unroll
    648     for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
    649       const int input_offset = num_x_input * (j + plane_kernel_offset);
    650       #pragma unroll
    651       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    652         const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
    653         s[i + input_offset] = eval.coeff(tensor_index);
    654       }
    655     }
    656 
    657     __syncthreads();
    658 
    659     // Convolution
    660     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    661 
    662     #pragma unroll
    663     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
    664       #pragma unroll
    665       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    666         float result = 0.0f;
    667         #pragma unroll
    668         for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
    669           const int kernel_offset = kernelSizeX * l;
    670           const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
    671           #pragma unroll
    672           for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
    673             result += s[k + input_offset] * kernel[k + kernel_offset];
    674           }
    675         }
    676         const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
    677         buffer[tensor_index] = result;
    678       }
    679     }
    680 
    681     __syncthreads();
    682   }
     683 }
    684 
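        // 3D GPU convolution: each block tiles the x/y/z extent in shared
        // memory and iterates over the planes sequentially.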
    685 template <typename InputEvaluator, typename Index, typename InputDims>
    686 __global__ void EigenConvolutionKernel3D(
    687     InputEvaluator eval,
    688     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
    689         indexMapper,
    690     const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    691     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    692     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    693     const size_t kernelSizeZ, float* buffer) {
    694   extern __shared__ float s[];
    695 
    696   // Load inputs to shared memory
    697   const int first_x = blockIdx.x * maxX;
    698   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    699   const int num_x_input = last_x - first_x + kernelSizeX;
    700 
    701   const int first_y = blockIdx.y * maxY;
    702   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
    703   const int num_y_input = last_y - first_y + kernelSizeY;
    704 
    705   const int first_z = blockIdx.z * maxZ;
    706   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
    707   const int num_z_input = last_z - first_z + kernelSizeZ;
    708 
    709   for (int p = 0; p < numPlanes; ++p) {
    710 
    711     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    712     const int plane_kernel_offset = 0;
    713 
    714     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
    715       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
    716         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    717           const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
    718           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
    719         }
    720       }
    721     }
    722 
    723     __syncthreads();
    724 
    725     // Convolution
    726     const int num_z_output = last_z - first_z + 1;
    727     const int num_y_output = last_y - first_y + 1;
    728     const int num_x_output = last_x - first_x + 1;
    729     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    730 
    731     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
    732       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
    733         for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    734           float result = 0.0f;
    735           for (int n = 0; n < kernelSizeZ; ++n) {
    736             for (int m = 0; m < kernelSizeY; ++m) {
    737               for (int l = 0; l < kernelSizeX; ++l) {
    738                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
    739               }
    740             }
    741           }
    742           const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
    743           buffer[tensor_index] = result;
    744         }
    745       }
    746     }
    747     __syncthreads();
    748   }
     749 }
    750 
    751 
    752 
    753 template<typename Indices, typename InputArgType, typename KernelArgType>
    754 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
    755 {
    756   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
    757 
    758   static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
    759   static const int NumKernelDims = internal::array_size<Indices>::value;
    760   typedef typename XprType::Index Index;
    761   typedef DSizes<Index, NumDims> Dimensions;
    762   typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
    763 
    764   enum {
    765     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    766     PacketAccess = false,
    767     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    768     CoordAccess = false,  // to be implemented
    769     RawAccess = false
    770   };
    771 
    772   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
    773       : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
    774   {
    775     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
    776 
    777     const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    778     const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
    779 
    780     m_dimensions = m_inputImpl.dimensions();
    781     for (int i = 0; i < NumKernelDims; ++i) {
    782       const Index index = op.indices()[i];
    783       const Index input_dim = input_dims[index];
    784       const Index kernel_dim = kernel_dims[i];
    785       const Index result_dim = input_dim - kernel_dim + 1;
    786       m_dimensions[index] = result_dim;
    787     }
    788   }
    789 
    790   typedef typename XprType::CoeffReturnType CoeffReturnType;
    791   typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
    792   typedef typename InputArgType::Scalar Scalar;
    793   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    794 
    795   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
    796 
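          // GPU path: the whole result is materialized into a buffer (either
          // the caller-provided destination or a temporary allocation) by
          // executeEval(); coeff() and packet() then simply read from that
          // buffer.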
    797   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    798     preloadKernel();
    799     m_inputImpl.evalSubExprsIfNeeded(NULL);
    800     if (data) {
    801       executeEval(data);
    802       return false;
    803     } else {
    804       m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
    805       executeEval(m_buf);
    806       return true;
    807     }
    808   }
    809 
    810   EIGEN_STRONG_INLINE void cleanup() {
    811     m_inputImpl.cleanup();
    812     if (m_buf) {
    813       m_device.deallocate(m_buf);
    814       m_buf = NULL;
    815     }
    816     if (m_local_kernel) {
    817       m_device.deallocate((void*)m_kernel);
    818       m_local_kernel = false;
    819     }
    820     m_kernel = NULL;
    821   }
    822 
    823   EIGEN_STRONG_INLINE void preloadKernel() {
    824     // Don't make a local copy of the kernel unless we have to (i.e. it's an
    825     // expression that needs to be evaluated)
    826     const Scalar* in_place = m_kernelImpl.data();
    827     if (in_place) {
    828       m_kernel = in_place;
    829       m_local_kernel = false;
    830     } else {
    831       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
    832       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
    833       typedef TensorEvalToOp<const KernelArgType> EvalTo;
    834       EvalTo evalToTmp(local, m_kernelArg);
    835       const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
    836       internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
    837 
    838       m_kernel = local;
    839       m_local_kernel = true;
    840     }
    841   }
    842 
    843   static unsigned int ceil(unsigned int num, unsigned int denom) {
    844     const unsigned int rounded_toward_zero = num / denom;
    845     if (num > rounded_toward_zero * denom) {
    846       return rounded_toward_zero + 1;
    847     }
    848     return rounded_toward_zero;
    849   }
    850 
    851   void executeEval(Scalar* data) const {
    852     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
    853 
    854     const int maxSharedMem = m_device.sharedMemPerBlock();
    855     const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    856     const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    857     const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    858     const int warpSize = 32;
    859 
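             // Choose launch parameters so that each block's shared-memory
             // tile (the output region it computes plus the kernel footprint)
             // fits within the device's shared-memory limit, then dispatch on
             // the number of convolved dimensions. Kernel sizes 4 and 7 are
             // special-cased so the inner loops can be fully unrolled.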
    860     switch (NumKernelDims) {
    861       case 1: {
    862         const int kernel_size = m_kernelImpl.dimensions().TotalSize();
    863 
    864         const int numX = dimensions()[m_indices[0]];
    865         const int numP = dimensions().TotalSize() / numX;
    866         int maxX;
    867         dim3 block_size;
    868 
    869         const int single_stride_dim =
    870             static_cast<int>(Layout) == static_cast<int>(ColMajor)
    871                 ? 0
    872                 : m_inputImpl.dimensions().rank() - 1;
    873         if (m_indices[0] == single_stride_dim) {
     874           // Maximize the reuse
    875           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
    876           maxX = numext::mini<int>(inner_dim, numX);
    877           const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
    878           block_size.x = numext::mini(maxThreadsPerBlock, maxX);
    879           block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
    880         }
    881         else {
     882           // Read as much as possible along the innermost dimension, that is the plane
    883           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
    884           const int maxP = numext::mini<int>(inner_dim, numP);
    885           maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
    886 
    887           block_size.x = numext::mini(warpSize, maxX);
    888           block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
    889         }
    890 
    891         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
    892         assert(shared_mem <= maxSharedMem);
    893 
    894         const int num_x_blocks = ceil(numX, maxX);
    895         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
    896         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
    897 
    898         dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
    899 
    900 
    901         //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
    902 
    903         const array<Index, 1> indices(m_indices[0]);
    904         const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
    905         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
    906             m_inputImpl.dimensions(), kernel_dims, indices);
    907         switch(kernel_size) {
    908           case 4: {
    909             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
    910             break;
    911           }
    912           case 7: {
    913             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
    914             break;
    915           }
    916           default: {
    917             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
    918           }
    919         }
    920         break;
    921       }
    922 
    923       case 2: {
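                 // For RowMajor layouts the two convolved dimensions are
                 // processed in reverse order to account for the reversed
                 // storage order.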
    924         const int idxX =
    925             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
    926         const int idxY =
    927             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
    928         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
    929         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
    930 
    931         const int numX = dimensions()[m_indices[idxX]];
    932         const int numY = dimensions()[m_indices[idxY]];
    933         const int numP = dimensions().TotalSize() / (numX*numY);
    934 
    935         const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
    936 
    937         // Snap maxX to warp size
    938         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
    939         const int maxX = numext::mini<int>(inner_dim, numX);
    940         const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
    941         const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
    942 
    943         dim3 block_size;
    944         block_size.x = numext::mini(1024, maxX);
    945         block_size.y = numext::mini<int>(1024/block_size.x, maxY);
    946         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
    947 
    948         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
    949         assert(shared_mem <= maxSharedMem);
    950 
    951         const int num_x_blocks = ceil(numX, maxX);
    952         const int num_y_blocks = ceil(numY, maxY);
    953         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
    954         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
    955 
    956         dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
    957 
    958 
    959         //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
    960 
    961         const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
    962         const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
    963                                           m_kernelImpl.dimensions()[idxY]);
    964         internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
    965             m_inputImpl.dimensions(), kernel_dims, indices);
    966         switch (kernel_size_x) {
    967           case 4: {
    968             switch (kernel_size_y) {
    969               case 7: {
    970                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
    971                 break;
    972               }
    973               default: {
    974                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
    975                 break;
    976               }
    977             }
    978             break;
    979           }
    980           case 7: {
    981             switch (kernel_size_y) {
    982               case 4: {
    983                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
    984                 break;
    985               }
    986               default: {
    987                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
    988                 break;
    989               }
    990             }
    991             break;
    992           }
    993           default: {
    994             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
    995             break;
    996           }
    997         }
    998         break;
    999       }
   1000 
   1001       case 3: {
   1002         const int idxX =
   1003             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
   1004         const int idxY =
   1005             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
   1006         const int idxZ =
   1007             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
   1008 
   1009         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
   1010         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
   1011         const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
   1012 
   1013         const int numX = dimensions()[m_indices[idxX]];
   1014         const int numY = dimensions()[m_indices[idxY]];
   1015         const int numZ = dimensions()[m_indices[idxZ]];
   1016         const int numP = dimensions().TotalSize() / (numX*numY*numZ);
   1017 
   1018         const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
   1019         const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
   1020         const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
   1021 
   1022         dim3 block_size;
   1023         block_size.x = numext::mini(32, maxX);
   1024         block_size.y = numext::mini(32, maxY);
   1025         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
   1026         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
   1027 
   1028         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
   1029         assert(shared_mem <= maxSharedMem);
   1030 
   1031         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
   1032         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
   1033                                       m_indices[idxZ]);
   1034         const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
   1035                                           m_kernelImpl.dimensions()[idxY],
   1036                                           m_kernelImpl.dimensions()[idxZ]);
   1037         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
   1038             m_inputImpl.dimensions(), kernel_dims, indices);
   1039 
   1040         LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
   1041         break;
   1042       }
   1043 
   1044       default: {
   1045         EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
   1046       }
   1047     }
   1048   }
   1049 
   1050   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   1051   {
   1052     eigen_assert(m_buf);
   1053     eigen_assert(index < m_dimensions.TotalSize());
   1054     return m_buf[index];
   1055   }
   1056 
   1057   template<int LoadMode>
   1058   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
   1059   {
   1060     eigen_assert(m_buf);
   1061     eigen_assert(index < m_dimensions.TotalSize());
   1062     return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
   1063   }
   1064 
   1065   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   1066   costPerCoeff(bool vectorized) const {
   1067     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
   1068     // model.
   1069     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
   1070     // We ignore the use of fused multiply-add.
   1071     const double convolve_compute_cost =
   1072         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
   1073     const double firstIndex_compute_cost =
   1074         NumDims *
   1075         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
   1076          TensorOpCost::DivCost<Index>());
   1077     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
   1078            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
   1079                           m_kernelImpl.costPerCoeff(vectorized) +
   1080                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
   1081                                        PacketSize));
   1082   }
   1083 
   1084  private:
   1085   // No assignment (copies are needed by the kernels)
   1086   TensorEvaluator& operator = (const TensorEvaluator&);
   1087 
   1088   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
   1089   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
   1090   KernelArgType m_kernelArg;
   1091   Indices m_indices;
   1092   Dimensions m_dimensions;
   1093   Scalar* m_buf;
   1094   const Scalar* m_kernel;
   1095   bool m_local_kernel;
   1096 
   1097   const GpuDevice& m_device;
   1098 };
   1099 #endif
   1100 
   1101 
   1102 } // end namespace Eigen
   1103 
   1104 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
   1105