// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor (at) babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
            typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  *
  * Computes a running reduction (a prefix "scan", e.g. a cumulative sum or
  * product) of the input expression along a single axis. On TensorBase this
  * operation is exposed through the cumsum() and cumprod() methods.
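  *
  * A minimal usage sketch, via the cumsum()/cumprod() entry points:
  * \code
  * Eigen::Tensor<float, 2> t(3, 4);
  * t.setRandom();
  * Eigen::Tensor<float, 2> s = t.cumsum(0);        // inclusive running sum along axis 0
  * Eigen::Tensor<float, 2> p = t.cumprod(1, true); // exclusive running product along axis 1
  * \endcode
  */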
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};

template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
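    // For ColMajor layout this is the product of the dimensions before the
    // scan axis; e.g. for dims (d0, d1, d2) and axis == 1, m_stride == d0,
    // so consecutive elements of a scan line are d0 entries apart in memory.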
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
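    // A destination buffer was supplied: scan directly into it. Returning
    // false tells the caller the result is already in place.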
    if (data) {
      launcher(*this, data);
      return false;
    }

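    // No destination buffer: materialize the scan into a temporary that
    // coeff()/packet() will read from (freed later in cleanup()), and return
    // true so the caller copies the result out of this evaluator.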
    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};

// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // The tensor is traversed as a set of independent 1-D scan lines: the
    // index along the scan axis is fixed to 0 and one scan is performed per
    // remaining entry. The iteration is split into two nested loops (idx1
    // jumps between blocks of scan lines, idx2 walks the lines within a
    // block) so the starting offset is tracked incrementally rather than
    // recovered with an integer division.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

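          // An exclusive scan stores the accumulator before absorbing the
          // current element; an inclusive scan stores it after. E.g. a
          // running sum over [1, 2, 3] yields [0, 1, 3] (exclusive) or
          // [1, 3, 6] (inclusive).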
          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
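  // val enumerates the scan lines, one per thread; the division and modulo
  // below recover the idx1/idx2 decomposition used by the CPU implementation.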
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
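    // One scan line per thread: there are total_size / self.size() lines,
    // launched in blocks of 64 threads with the block count rounded up.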
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && __CUDACC__

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H