// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme around here is that we try hard
// to handle both the Lhs and the Rhs with a single piece of code. We refer
// to either of them indifferently as a 'Side'. Instead of addressing
// matrices by (row, column) indices, we address them by (width, depth), as
// explained in kernel.h. This lets us handle the Lhs and the Rhs on an
// equal footing.
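//
// For example, in the product Result = Lhs * Rhs, the Lhs's 'width'
// dimension is its rows and its 'depth' dimension is its columns, while for
// the Rhs it is the other way around; 'depth' is always the dimension along
// which the products get summed.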

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
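//
// For illustration, a sketch of the typical lifecycle, modelled on the
// calling code in single_thread_gemm.h (here 'allocator', 'block_params',
// 'lhs', 'r', 'rs' and 'depth' are assumed to have been set up by the
// caller):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
//       Side::Lhs, allocator, block_params);
//   allocator->Commit();  // make the Reserve()'d storage usable
//   PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));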
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

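  // seek_run positions the sequential cursor pos_ at the start of the run
  // at (start_width, start_depth): the packed buffer stores one l1_depth-deep
  // slice after another, each slice holding its kernel-width runs
  // consecutively. Hence the offset below is the size of all preceding
  // complete slices (params_.l2_width * start_depth) plus the runs preceding
  // start_width within the current slice.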
  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons that order is complicated, so the offset calculations needed
  // for random access would be intricate. We therefore give up on
  // random-access addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
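//
// For example, a block of width 2 and depth 3 holding the entries
//
//   a b c    <- width index 0
//   d e f    <- width index 1
//
// is laid out as "a b c d e f" in WidthMajor order (entries with the same
// width index are contiguous) and as "a d b e c f" in DepthMajor order.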
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic, non-optimized implementation;
// it is inherited by PackingRegisterBlock below, which may be overridden
// by template specialization. Overriding it is how one may provide
// optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
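//
// For example, with kKernelWidth = 4 and kRegisterSize = 8, a trailing
// source block of width 3 and depth 5 is incomplete: MakeCompleteSrc copies
// it into a local 4x8 temporary whose remaining entries are filled with
// kZeroPointInputValue, and Pack then reads from that temporary.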
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static const int kZeroPointInputValue =
      ZeroPointInputValue<KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block (see
  // UseCompleteSrcInPlace), or to the local buf_ below, into which we copy
  // incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which we load incomplete blocks,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
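            // kZeroPointInputValue is nonzero for kernel formats taking
            // signed int8 inputs: subtracting it re-centers the uint8
            // source value, and the uint8 store below keeps the resulting
            // two's-complement bit pattern for the kernel to reinterpret.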
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
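    // We have just written kKernelWidth * kRegisterSize entries, i.e.
    // (kKernelWidth / kCellWidth) * (kRegisterSize / kCellDepth)
    // = kCells * kRegisterSize / kCellDepth cells, so advance by that many.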
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
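  // Note that the sums of each slice are accumulated (+=) by
  // PackingRegisterBlock::Pack() as successive blocks get packed, which is
  // why they are zeroed once here at the start.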
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs a single run, i.e. it is the inner loop along the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
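
// Note on orientations: for the Lhs, 'width' is the rows dimension, so a
// RowMajor Lhs yields a WidthMajor SideMap, while for the Rhs, 'width' is
// the columns dimension, so it is a ColMajor Rhs that yields a WidthMajor
// SideMap. This is why PackLhs and PackRhs above test opposite MapOrders.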

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_