// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor (at) babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
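// Usage sketch: a TensorScanOp is normally obtained through the scan helpers
// on TensorBase (e.g. cumsum() and cumprod()) rather than constructed
// directly. Assuming a rank-3 float tensor, a cumulative sum along
// dimension 1 would look like:
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 3> inclusive = t.cumsum(1);        // running sums including the current entry
//   Eigen::Tensor<float, 3> exclusive = t.cumsum(1, true);  // sums of strictly earlier entries only
//
// The second argument maps onto the `exclusive` flag of the TensorScanOp
// constructor above.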
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};
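// Indexing note (illustrative example, in terms of the accessors defined
// above): the launchers below treat the flat output as a collection of
// independent 1-D scans. A scan starting at flat offset `o` visits
//   o, o + stride(), o + 2 * stride(), ..., o + (size() - 1) * stride().
// For instance, for a ColMajor tensor of dimensions (4, 3, 2) scanned along
// axis 1, stride() == 4 and size() == 3, so the scan starting at offset 3
// visits flat indices 3, 7 and 11, i.e. coefficients (3, 0, 0), (3, 1, 0)
// and (3, 2, 0). The CPU launcher enumerates the starting offsets as
// idx1 + idx2, with idx1 stepping by stride() * size() and idx2 running over
// [0, stride()), which avoids an integer division per output element.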
// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a scan per
    // remaining entry. The iteration is split into two nested loops to avoid
    // an integer division by keeping track of idx1 and idx2 separately.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && __CUDACC__

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H