// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen (at) codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation for the
 *  SYCL backend.
 *
*****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' IO is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no co-ordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              int min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}

};
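
// Worked trace of the pass structure above (illustrative comment only, no code
// change): with length = 1024 and local = 256, the first submission launches
// 1024 work-items in 4 work-groups; each group tree-reduces its 256 inputs in
// local memory and its work-item 0 writes one partial result back into bufI.
// length then becomes 1024 / 256 = 4. In the second submission only the first 4
// work-items see valid data; they reduce the 4 partials and, since length <= local,
// work-item 0 also writes the final value into bufOut. length / local is now 0,
// so the do/while loop terminates.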

/// For now let's start with a full reducer.
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We want to take the child of the reduction, build a construction from it and apply the full reducer function on it. FullReducer applies the
/// reduction operation on the child of the reduction. Once that is done, the reduction is an empty shell and can be thrown away and treated as
/// a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor. If the size is less than red_factor we only create one work-item.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // the initial number of work-items is the input size divided by red_factor
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
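    // Worked example of the split above (comment only): for inputSize = 1000 and
    // red_factor = 256 we get rng = 1000 / 256 = 3 and remaining = 1000 % 256 = 232,
    // so three work-items each pre-reduce one 256-element chunk and work-item 0
    // additionally folds in the trailing 232 elements inside the kernel below. When
    // inputSize < 256, rng is 0 and the whole input ends up in the "remaining" tail
    // handled by work-item 0.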
    size_t tileSize = dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of 2 for the recursive reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
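    // Example of the round-up above (comment only): rng = 3 gives GRange = 4 and
    // rng = 1000 gives GRange = 1024; a value that is already a power of two, such
    // as rng = 256, is left unchanged. The decrement/increment pair is what keeps
    // exact powers of two from being doubled.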
    size_t outTileSize = tileSize;
    /// if the global range is smaller than the tile (work-group) size, clamp the tile size to GRange so that a single work-group can reduce everything in one recursion step.
    if (GRange < outTileSize) outTileSize = GRange;
    // get the final output buffer; it is written directly because there is no separate assign step here
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// creating the temporary global memory for calculating the reduction.
    /// This one is used to collect all the partially reduced values, as we don't have a global barrier on the GPU. Once it is filled we can
    /// recursively apply the reduction on it in order to reduce the whole input.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// the reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node to pass the
        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is exactly similar to the self that has been passed to the run function. The difference is that
        /// the device evaluator is detectable and recognisable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to solve the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // this adds the rest of the input buffer when the input size is not divisible by red_factor.
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// This recursively reduces the temporary buffer down to a single element.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};
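
// A minimal host-side usage sketch for the full reducer (illustrative only; the
// queue/device setup mirrors this version's SYCL tests, where Eigen::SyclDevice
// wraps a cl::sycl::queue, and gpu_in_ptr / gpu_out_ptr stand for device-accessible
// buffers whose allocation is omitted here):
//
//   cl::sycl::queue q((cl::sycl::gpu_selector()));
//   Eigen::SyclDevice sycl_device(q);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in_ptr, 64, 64);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out_ptr);
//   out.device(sycl_device) = in.sum();  // reducing every dimension dispatches to FullReducer::run above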

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
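    // Worked example of the rounding above (comment only): with
    // num_coeffs_to_preserve = 1000 and tileSize = 256, xMode = 1000 % 256 = 232 and
    // GRange becomes 1000 + (256 - 232) = 1024, i.e. the next multiple of the
    // work-group size. The surplus work-items are masked off by the
    // `globalid < num_coeffs_to_preserve` check inside the kernel below.
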
    // Unlike the full reducer, every preserved coefficient is written directly to
    // the output buffer below, so no temporary global buffer or recursive
    // reduction step is needed here.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// the reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node to pass the
        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is exactly similar to the self that has been passed to the run function. The difference is that
        /// the device evaluator is detectable and recognisable on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to solve the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
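
// A minimal host-side usage sketch for a partial reduction (illustrative only;
// the queue/device setup and the exact dispatch conditions in TensorReduction.h
// are assumptions based on this version's SYCL tests; gpu_in_ptr / gpu_out_ptr and
// sycl_device are the same placeholders as in the sketch after FullReducer):
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in_ptr, 64, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(gpu_out_ptr, 64);
//   Eigen::array<int, 1> red_dim{{1}};
//   out.device(sycl_device) = in.sum(red_dim);  // partial reduction, 64 preserved coefficients
//
// When such an expression lands in InnerReducer::run above, num_coeffs_to_preserve
// is 64 and each surviving coefficient is accumulated by one work-item via
// GenericDimReducer before being written to the output accessor.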

}  // end namespace internal
}  // end namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP