// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme here is that we try hard to handle
// both the Lhs and the Rhs with a single piece of code, so we refer to
// either of them generically as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This lets us treat the Lhs and the Rhs on an equal footing,
// with the same code.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "../public/bit_depth.h"
#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator),
        pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack the data in the same order in which it
  // will be traversed during the computation. That order is chosen for
  // cache-friendliness, which makes the offset calculations needed for
  // random access intricate. So we give up random-access addressing and
  // content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
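
// Illustrative traversal (a sketch, not part of the library): this is
// roughly how a consumer such as compute.h walks a packed block
// sequentially. The names 'packed_lhs', 'start_width', 'start_depth' and
// the loop over runs are assumptions made up for this example.
//
//   packed_lhs.seek_run(start_width, start_depth);
//   for (/* each kernel run along the depth dimension */) {
//     const std::uint8_t* lhs_ptr = packed_lhs.current_data();
//     // ... feed lhs_ptr to the kernel ...
//     packed_lhs.seek_next_cell();  // or seek_forward_n_cells(n)
//   }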

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
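
// Example (illustrative only, not part of the library): a 3x4 row-major
// LHS block, where rows are the 'width' dimension and columns are the
// 'depth' dimension, viewed as a WidthMajor SideMap. The name 'lhs_data'
// is made up for this example.
//
//   const std::uint8_t* lhs_data = ...;  // 3 rows x 4 cols, row-major
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> side(
//       lhs_data, /* width (rows) */ 3, /* depth (cols) */ 4, /* stride */ 4);
//   // Then side(w, d) == lhs_data[w * 4 + d], since
//   // width_stride() == 4 and depth_stride() == 1.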

template <RoundingMode tRoundingMode>
class ScalarRoundingOffsetGenerator {
 public:
  std::uint8_t get() {
    assert(false);  // This generic path should never be called.
    return 0;
  }
};

// A RoundingOffsetGenerator for rounding-to-nearest, always returning
// the midpoint value 127.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
 public:
  std::uint8_t get() { return 127; }
};

// A RoundingOffsetGenerator based on an 8-bit Xorshift.
// This gives good results, as Xorshift naturally generates
// uniform random *nonzero* bytes, i.e. 255 different values,
// so it only remains for us to subtract one.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
 public:
  ScalarRoundingOffsetGenerator() { x_ = 128; }

  std::uint8_t get() {
    std::uint8_t result = x_ - 1;
    // Xorshift8(7,5,3)
    x_ ^= x_ << 7;
    x_ ^= x_ >> 5;
    x_ ^= x_ << 3;
    return result;
  }

 private:
  // State
  std::uint8_t x_;
};
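
// A quick way to convince oneself of the 255-value period claimed above
// (an illustrative check, not part of the library; it would need <set>
// and <cassert>):
//
//   std::set<int> seen;
//   std::uint8_t x = 128;
//   do {
//     x ^= x << 7;
//     x ^= x >> 5;
//     x ^= x << 3;
//     seen.insert(x);
//   } while (x != 128);
//   assert(seen.size() == 255);  // every nonzero byte value is visited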

// A RoundingOffsetGenerator based on an 8-bit add/mod
// low-discrepancy sequence. See less-than-8-bit.txt for
// an explanation (the constant 97 matters: it must be
// relatively prime to 255 so that the sequence has full
// period, and 97/255 should be close to 0.38 to obtain
// low discrepancy). Uses a small bit hack to avoid
// expensive % operations.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
  static const std::uint8_t AddConst = 97;

 public:
  ScalarRoundingOffsetGenerator() { x_ = 1; }  // Start must be non-zero

  std::uint8_t get() {
    // The +'d boolean term causes the increment to skip over 255
    // (recalling that 255 + 1 = 256 = 0 for an 8-bit uint),
    // thus implementing % 255.
    x_ += (AddConst + (x_ >= (255 - AddConst)));
    return x_;
  }

 private:
  // State
  std::uint8_t x_;
};
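
// For reference, the bit hack in get() above is equivalent to the plainer
// (but slower, due to the integer division) update
//
//   x_ = (x_ + AddConst) % 255;
//
// i.e. a walk over the residues mod 255 with step AddConst = 97. Since 97
// is coprime to 255, the walk has full period 255, and the value 255
// itself never occurs.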

// Requantizes a source uint8 value in the [0..255] range
// to the range specified by BitDepth, [0..((2^bits)-1)].
// Bias must be avoided. Currently this is achieved
// by probabilistic rounding.
template <typename QuantizationParams>
std::uint8_t Requantize(
    std::uint8_t raw_src_val,
    ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
        rounding_offset_generator) {
  static const int kBits = QuantizationParams::BitDepth::kBits;
  static const std::uint8_t kMaxVal = (1 << kBits) - 1;

  if (kBits == 8) {
    return raw_src_val;
  }

  std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
  std::uint8_t rounding_offset = rounding_offset_generator->get();
  return (scaled + rounding_offset) / 255;
}
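
// Worked example (illustrative, for a hypothetical 5-bit BitDepth): with
// kBits == 5 we have kMaxVal == 31. For raw_src_val == 200,
//
//   scaled = 200 * 31 = 6200,   and 6200 / 255 = 24.31...
//
// With RoundingMode::Nearest the offset is 127, so the result is
// (6200 + 127) / 255 = 24. With a probabilistic rounding mode, the offset
// is roughly uniform over [0, 254], so the result is 25 whenever
// offset >= 175, i.e. with probability 80/255 = 0.31..., matching the
// fractional part. That is what keeps the rounding unbiased on average.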

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic, non-optimized implementation;
// it is inherited by the generic implementation of PackingRegisterBlock,
// which may be overridden by template specialization. Overriding it is how
// one provides optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;

  typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to the actual
  // in-place source data if it's already a complete block
  // (see UseCompleteSrcInPlace),
  // or to the local buf_ below, into which we copy incomplete blocks
  // (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which we load incomplete blocks,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that is already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, 0, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t raw_src_val = src_cell_map(w, d);
            const std::uint8_t requantized = Requantize<QuantizationParams>(
                raw_src_val, rounding_offset_generator);
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
            sum += requantized;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
                                      PackedSideBlock> {};
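
// For reference, this is roughly how an architecture-specific path plugs
// in: it partially specializes PackingRegisterBlock for a given source map
// and kernel side format, and provides its own Pack(). The sketch below is
// illustrative only ('SomeKernelSideFormat' is a made-up placeholder); the
// real specializations live in pack_neon.h and pack_SSE.h.
//
//   template <typename QuantizationParams>
//   class PackingRegisterBlock<
//       QuantizationParams,
//       SideMap<const std::uint8_t, SideMapOrder::WidthMajor>,
//       PackedSideBlock<SomeKernelSideFormat> >
//       : public PackingRegisterBlockBase<
//             QuantizationParams,
//             SideMap<const std::uint8_t, SideMapOrder::WidthMajor>,
//             PackedSideBlock<SomeKernelSideFormat> > {
//    public:
//     // Same contract as PackingRegisterBlockBase::Pack, but implemented
//     // with hand-written SIMD loads/stores instead of the scalar loops.
//     void Pack(PackedSideBlock<SomeKernelSideFormat>* dst, int start_width,
//               ScalarRoundingOffsetGenerator<
//                   QuantizationParams::kRoundingMode>* rounding_offset_generator);
//   };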

// Large-scale implementation of packing.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;
  typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
      RoundingOffsetGenerator;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop in the depth dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map over the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;

  // Used for requantization in the less-than-8-bit case.
  // Otherwise unused.
  RoundingOffsetGenerator rounding_offset_generator_;
};
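
// For reference, the loop nest implemented above is:
//
//   PackL2()
//     - iterates over l1_depth x l1_width tiles of the block, calling
//       PrefetchL1() and then PackL1() on each tile;
//   PackL1()
//     - iterates over kKernelWidth-wide runs within the tile, seeking the
//       destination to the start of each run and calling PackRun();
//   PackRun()
//     - iterates over kRegisterSize-deep slices of the run, handing each
//       complete (or zero-extended) slice to PackingRegisterBlock::Pack().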

// Quantization parameters for the side (LHS or RHS) being packed,
// with the rounding strategy having been already resolved to a specific
// rounding mode.
template <typename tBitDepth, RoundingMode tRoundingMode>
struct QuantizationParams {
  typedef tBitDepth BitDepth;
  static const RoundingMode kRoundingMode = tRoundingMode;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef typename BitDepthParams::LhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef typename BitDepthParams::RhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}
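
// Illustrative call site (a sketch only, based on assumptions about the
// surrounding GEMM code, e.g. single_thread_gemm.h, which owns the
// Allocator, the BlockParams and the source blocks; all variable names
// below are made up for this example):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
//       Side::Lhs, &allocator, block_params);
//   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
//       Side::Rhs, &allocator, block_params);
//   allocator.Commit();
//   PackLhs<BitDepthParams>(&packed_lhs, lhs_block);
//   PackRhs<BitDepthParams>(&packed_rhs, rhs_block);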

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_SSE.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_