// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen (at) codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operations (full and
 *  inner reductions) for the SYCL backend.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template <typename CoeffReturnType, typename KernelName>
struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev,
                  size_t length, size_t local) {
    do {
      auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer that is being reduced,
         * and a second to local memory, used to store intermediate data. */
        auto aI =
            bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut =
            bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
        cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(local), h);

        /* The parallel_for invocation chosen is the variant with an nd_item
         * parameter, since the code requires barriers for correctness. */
        h.parallel_for<KernelName>(
            r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
              size_t globalid = id.get_global(0);
              size_t localid = id.get_local(0);
              /* All threads collectively read from global memory into local.
               * The barrier ensures all threads' IO is resolved before
               * execution continues (strictly speaking, all threads within
               * a single work-group - there is no coordination between
               * work-groups, only work-items). */
              if (globalid < length) {
                scratch[localid] = aI[globalid];
              }
              id.barrier(cl::sycl::access::fence_space::local_space);

              /* Apply the reduction operation between the current local
               * id and the one on the other half of the vector. */
              if (globalid < length) {
                size_t min = (length < local) ? length : local;
                for (size_t offset = min / 2; offset > 0; offset /= 2) {
                  if (localid < offset) {
                    scratch[localid] += scratch[localid + offset];
                  }
                  id.barrier(cl::sycl::access::fence_space::local_space);
                }
                /* Each work-group's result ends up in local id 0. */
                if (localid == 0) {
                  aI[id.get_group(0)] = scratch[localid];
                  if ((length <= local) && globalid == 0) {
                    aOut[globalid] = scratch[localid];
                  }
                }
              }
            });
      };
      dev.m_queue.submit(f);
      dev.m_queue.throw_asynchronous();

      /* At this point, you could queue::wait_and_throw() to ensure that
       * errors are caught quickly. However, this would likely impact
       * performance negatively. */
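      /* Illustrative example (the numbers are hypothetical): each pass folds
       * `local` consecutive partial results into one, so `length` shrinks by
       * a factor of `local` per iteration. With length = 4096 and local = 64,
       * the first pass leaves 64 partial results and the second pass writes
       * the final value to aOut, after which the loop below terminates. Note
       * that this helper hard-codes += as the combining operation, so it is
       * only valid for sum-style reductions. */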
      length = length / local;
    } while (length > 1);
  }
};

/// For now let's start with a full reducer.
/// Self is not used directly here because, during expression construction, the
/// reduction is treated as a leaf node. We take the child of the reduction,
/// build an expression from it, and apply the full reducer function to that
/// child. Once FullReducer has applied the reduction operation to the child,
/// the reduction node itself is an empty shell that can be discarded and
/// treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor; if the input is smaller than red_factor, only one thread is created
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // initially one thread per red_factor input elements
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max(static_cast<size_t>(1), rng);

    // round the global range up to the next power of two, as required by the
    // halving recursion in syclGenericBufferReducer
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    // size_t is 64 bits wide on these platforms, so one extra shift is needed
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    /// if GRange is smaller than the work-group size, shrink the work-group to
    /// GRange; in that case a single kernel launch reduces everything to one value.
    if (GRange < outTileSize) outTileSize = GRange;
    // get the final output buffer; no separate assign kernel is needed here
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// Create the temporary global buffer for the reduction.
    /// It collects the per-work-group partial results, since there is no global
    /// barrier on the GPU. Once the partial results are stored, the reduction is
    /// applied to this buffer recursively until a single value remains.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
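    /* Illustrative walk-through with example numbers (not from the source):
     * for inputSize = 1000 and red_factor = 256, rng = 3, remaining = 232,
     * and GRange is rounded up from 3 to 4. The kernel submitted below
     * launches GRange work-items: items 0..2 each reduce 256 consecutive
     * inputs into temp_global_buffer, item 3 writes the scalar 0 (neutral
     * padding for sum-like reductions), and item 0 additionally folds in the
     * 232 leftover elements. syclGenericBufferReducer then collapses the 4
     * partial results into the single output value. */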
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// The reduction cannot be captured automatically through our device
        /// conversion recursion, because a reduction has two behaviours: it can
        /// be the root that launches the sub-kernel, or a leaf node that passes
        /// its calculated result to a parent kernel. The latter is detected
        /// automatically by our device expression generator; the former is
        /// created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the
        /// `self` passed to run(), except that this evaluator is recognisable
        /// on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // fold in the tail of the input when the input size is not divisible by red_factor
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// Recursively reduce the temporary buffer down to a single element.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};
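/* A hedged usage sketch (illustrative only; the queue/device set-up below is
 * an assumption based on typical Eigen SYCL test code, not something defined
 * in this file):
 *
 *   cl::sycl::queue q((cl::sycl::gpu_selector()));
 *   Eigen::SyclDevice sycl_device(q);
 *   Eigen::Tensor<float, 2> in(64, 64);
 *   Eigen::Tensor<float, 0> out;
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_in(in.data(), 64, 64);
 *   Eigen::TensorMap<Eigen::Tensor<float, 0>> gpu_out(out.data());
 *
 *   // Reducing over all dimensions dispatches to FullReducer::run above,
 *   // while reducing over a subset of dimensions dispatches to
 *   // InnerReducer::run below:
 *   gpu_out.device(sycl_device) = gpu_in.sum();
 */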
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) {
      tileSize = GRange;
    } else {
      // pad the global range up to the next multiple of the tile size so that
      // it divides evenly into work-groups
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    /// No temporary buffer is needed here: each preserved output coefficient is
    /// reduced independently by a single work-item, so there is no cross-work-group
    /// communication and results are written straight to the output buffer.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// The reduction cannot be captured automatically through our device
        /// conversion recursion, because a reduction has two behaviours: it can
        /// be the root that launches the sub-kernel, or a leaf node that passes
        /// its calculated result to a parent kernel. The latter is detected
        /// automatically by our device expression generator; the former is
        /// created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the
        /// `self` passed to run(), except that this evaluator is recognisable
        /// on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};

}  // end namespace internal
}  // end namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP