// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We indifferently
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
// at once.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation, and that order is,
  // for cache-friendliness reasons, complicated enough that random
  // access would require intricate offset calculations. So we give up
  // random-access addressing, and instead content ourselves
  // with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
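
// For illustration only: the compute stage consumes a PackedSideBlock
// sequentially, in the same order in which it was written during packing.
// A hypothetical sketch of such a traversal (names invented here; the real
// traversal code lives in compute.h and in the kernels):
//
//   packed.seek_run(start_width, start_depth);
//   for (int i = 0; i < cells_in_this_run; i++) {
//     const std::uint8_t* cell_data = packed.current_data();
//     // ... feed this cell's data to the kernel ...
//     packed.seek_next_cell();
//   }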

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
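
// Worked example of the addressing above: with a WidthMajor SideMap of
// width 4, depth 8 and stride 8, element (w, d) lives at data_ + w * 8 + d,
// so entries sharing a 'width' index are contiguous. For the Lhs, where
// 'width' is the rows dimension, this is exactly row-major storage.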

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic, non-optimized implementation;
// it is inherited by the generic implementation of PackingRegisterBlock,
// which may be overridden by template specialization. Overriding it is how
// one may provide optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static const int kZeroPointInputValue =
      ZeroPointInputValue<KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to
  // in-place actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace),
  // or to the local buf_ below into which we copy incomplete blocks
  // (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks to,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};
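
// The architecture-specific headers included at the bottom of this file
// (pack_neon.h, pack_sse.h, pack_msa.h) provide optimized packing by
// specializing PackingRegisterBlock for particular source-map/packed-block
// combinations. A rough, hypothetical sketch of the shape of such a
// specialization (SomeWidthMajorUint8SideMap is an invented name; see the
// actual headers for the real types):
//
//   template <typename PackedSideBlock>
//   class PackingRegisterBlock<SomeWidthMajorUint8SideMap, PackedSideBlock>
//       : public PackingRegisterBlockBase<SomeWidthMajorUint8SideMap,
//                                         PackedSideBlock> {
//    public:
//     void Pack(PackedSideBlock* dst, int start_width) {
//       // SIMD implementation of step 2 above.
//     }
//   };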

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};
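
// To make PackRun above concrete with example numbers: if, say, kRegisterSize
// were 8 and a full-width run had depth 21, then register_aligned_depth would
// be 16, so the first 16 levels of depth would be packed from two complete
// in-place source blocks (UseCompleteSrcInPlace + Pack), and the remaining 5
// from one zero-extended temporary block (MakeCompleteSrc + Pack).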

// Packs a block of the input LHS matrix, into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix, into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
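
// Illustrative sketch of how these entry points are typically used
// (hypothetical variable names; the real call sites live elsewhere in
// gemmlowp, e.g. in single_thread_gemm.h):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs,
//                                                          &allocator,
//                                                          block_params);
//   PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));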

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_