// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Readers note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We indifferently
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal
// footing, at once.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "../public/bit_depth.h"
#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons, that order is complicated enough that random access would
  // require intricate offset calculations. So we give up random-access
  // addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
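
// Illustrative sketch, not part of the library: the intended access pattern
// on a PackedSideBlock is purely sequential, mirroring the order in which
// the packing loops below write it. A consumer of one kernel-width run might
// look roughly like this (the names 'packed', 'run_start_width',
// 'run_start_depth' and 'run_depth' are made up for the example; the real
// traversal is defined by compute.h and the kernel format):
//
//   packed.seek_run(run_start_width, run_start_depth);
//   for (int d = 0; d < run_depth; d += KernelSideFormat::Cell::kDepth) {
//     for (int c = 0; c < KernelSideFormat::kCells; c++) {
//       const std::uint8_t* cell_data = packed.current_data();
//       // ... consume Cell::kSize contiguous bytes at cell_data ...
//       packed.seek_next_cell();
//     }
//   }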

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
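
// Illustrative sketch, not part of the library: addressing a row-major LHS
// block through a SideMap. For the LHS, 'width' is the rows dimension and
// 'depth' is the columns dimension, so row-major storage maps to WidthMajor
// (this is exactly the mapping that PackLhs() performs below). The pointer
// and the names 'rows', 'cols', 'w', 'd', 'w0', 'd0' are placeholders for
// the example.
//
//   const std::uint8_t* lhs_block_data = ...;  // rows x cols, row-major
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> lhs_side_map(
//       lhs_block_data, /* width = */ rows, /* depth = */ cols,
//       /* stride = */ cols);
//   std::uint8_t entry = lhs_side_map(w, d);  // same entry as (row w, col d)
//   // Sub-blocks keep the same storage order and stride:
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> sub =
//       lhs_side_map.block(w0, d0, block_width, block_depth);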

// A RoundingOffsetGenerator produces the rounding offsets consumed by
// Requantize() below. This generic scalar version is never meant to be
// used directly; only the specializations below are.
template <RoundingMode tRoundingMode>
class ScalarRoundingOffsetGenerator {
 public:
  std::uint8_t get() {
    assert(false);  // This generic path should never be called.
    return 0;
  }
};

// A RoundingOffsetGenerator for rounding-to-nearest, always returning
// the midpoint value 127.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
 public:
  std::uint8_t get() { return 127; }
};

// A RoundingOffsetGenerator based on an 8-bit Xorshift.
// This gives good results as Xorshift naturally generates
// uniform random *nonzero* bytes, i.e. 255 different values,
// so it only remains for us to subtract one.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
 public:
  ScalarRoundingOffsetGenerator() { x_ = 128; }

  std::uint8_t get() {
    std::uint8_t result = x_ - 1;
    // Xorshift8(7,5,3)
    x_ ^= x_ << 7;
    x_ ^= x_ >> 5;
    x_ ^= x_ << 3;
    return result;
  }

 private:
  // State
  std::uint8_t x_;
};

// A RoundingOffsetGenerator based on an 8-bit add/mod
// low-discrepancy sequence. See less-than-8-bit.txt for
// an explanation. The constant 97 is important: it must be
// relatively prime to 255 for the sequence to be full-period,
// and 97/255 should be close to 0.38 to obtain low discrepancy.
// Uses a small bit hack to avoid expensive % operations.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
  static const std::uint8_t AddConst = 97;

 public:
  ScalarRoundingOffsetGenerator() { x_ = 1; }  // Start must be non-zero

  std::uint8_t get() {
    // The +'d boolean term causes the increment to skip over 255
    // (recalling that 255 + 1 = 256 = 0 for an 8-bit uint),
    // thus implementing %255.
    x_ += (AddConst + (x_ >= (255 - AddConst)));
    return x_;
  }

 private:
  // State
  std::uint8_t x_;
};
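
// Worked example (a sketch, not additional library code) of the bit hack
// above, with AddConst = 97 and the threshold 255 - 97 = 158. We want
// x <- (x + 97) mod 255, never producing the value 255 (it is identified
// with 0), without paying for a division:
//
//   // x_ = 100:  100 < 158,  so x_ += 97            -> 197 == (100 + 97) % 255
//   // x_ = 200:  200 >= 158, so x_ += 98, which
//   //            wraps mod 256 to 200 + 98 - 256    -> 42  == (200 + 97) % 255
//
// In both cases the free uint8 wraparound at 256, plus the conditional +1,
// reproduces the %255 reduction exactly.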

// Requantizes a source uint8 value in [0..255] range
// to the range specified by BitDepth, [0..((2^bits)-1)].
// Bias must be avoided. Currently this is achieved
// by probabilistic rounding.
template <typename QuantizationParams>
std::uint8_t Requantize(
    std::uint8_t raw_src_val,
    ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
        rounding_offset_generator) {
  static const int kBits = QuantizationParams::BitDepth::kBits;
  static const std::uint8_t kMaxVal = (1 << kBits) - 1;

  if (kBits == 8) {
    return raw_src_val;
  }

  std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
  std::uint8_t rounding_offset = rounding_offset_generator->get();
  return (scaled + rounding_offset) / 255;
}
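
// Worked example (a sketch, not additional library code) of the arithmetic
// above for a 5-bit BitDepth (kBits = 5, kMaxVal = 31) and raw_src_val = 200:
//
//   scaled = 200 * 31 = 6200
//   // RoundingMode::Nearest (offset 127):       (6200 + 127) / 255 = 24
//   // Probabilistic modes (offset in [0, 254]): result is 24 for offsets
//   // 0..174 and 25 for offsets 175..254, so the expected value is
//   // (175 * 24 + 80 * 25) / 255 = 6200 / 255, i.e. exactly 200 * 31 / 255.
//
// That is, the probabilistic rounding is unbiased on average, which is the
// whole point of the rounding offset generators above.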

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This base class is the generic, non-optimized implementation.
// It is inherited by the generic implementation of PackingRegisterBlock,
// which may be overridden by template specialization. Overriding it is how
// one provides optimized packing code paths.
//
// The packing of a block proceeds in two steps:
// 1. Ensuring that we have a complete block of source data, i.e. a block of
//    the compile-time prescribed size. This is where we handle unaligned
//    boundaries: if we don't have a complete block of source data, then
//    we copy and zero-extend it into a local temporary (complete_src_),
//    see MakeCompleteSrc. In the generic case, we do have a complete block,
//    so we just use it in-place, see UseCompleteSrcInPlace.
// 2. Packing a complete block into the destination, see Pack. This is the
//    most critical part, so it's convenient that unaligned boundaries have
//    already been handled in step 1.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;

  typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to
  // the in-place actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace),
  // or to the local buf_ below into which we copy incomplete blocks
  // (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks into,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects in-place source data that is already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, 0, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t raw_src_val = src_cell_map(w, d);
            const std::uint8_t requantized = Requantize<QuantizationParams>(
                raw_src_val, rounding_offset_generator);
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
            sum += requantized;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
                                      PackedSideBlock> {};
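
// Architecture-specific packing code plugs in by specializing
// PackingRegisterBlock for a concrete SrcMapType; this is what the
// pack_neon.h / pack_SSE.h headers included at the bottom of this file do.
// A minimal sketch of the shape of such a specialization, assuming a
// hypothetical 'FastUint8WidthMajorSideMap' typedef for the targeted
// source map type (the name is made up for this example):
//
//   template <typename QuantizationParams, typename PackedSideBlock>
//   class PackingRegisterBlock<QuantizationParams,
//                              FastUint8WidthMajorSideMap, PackedSideBlock>
//       : public PackingRegisterBlockBase<
//             QuantizationParams, FastUint8WidthMajorSideMap,
//             PackedSideBlock> {
//    public:
//     void Pack(PackedSideBlock* dst, int start_width,
//               ScalarRoundingOffsetGenerator<
//                   QuantizationParams::kRoundingMode>* rog) {
//       // Same contract as the generic Pack() above: write requantized
//       // cells at dst->current_data(), accumulate into
//       // dst->sums_of_each_slice(), then seek_forward_n_cells(...) --
//       // just vectorized.
//     }
//   };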

// Large-scale implementation of packing.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;
  typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
      RoundingOffsetGenerator;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop over the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map over the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;

  // Used for requantization in the less-than-8-bit case.
  // Otherwise unused.
  RoundingOffsetGenerator rounding_offset_generator_;
};
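
// Putting the above together: packing one L2-sized block is a nest of loops,
// each level handing fixed-size pieces to the level below. This is only a
// schematic summary of the methods above, not additional code:
//
//   PackL2():   for each l1_depth-deep slice
//                 for each l1_width-wide slice       -> PrefetchL1, PackL1
//   PackL1():     for each kKernelWidth-wide run     -> seek_run, PackRun
//   PackRun():      for each kRegisterSize-deep register block
//                     UseCompleteSrcInPlace() or MakeCompleteSrc(),
//                     then PackingRegisterBlock::Pack()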

// Quantization parameters for the side (LHS or RHS) being packed,
// with the rounding strategy having already been resolved to a specific
// rounding mode.
template <typename tBitDepth, RoundingMode tRoundingMode>
struct QuantizationParams {
  typedef tBitDepth BitDepth;
  static const RoundingMode kRoundingMode = tRoundingMode;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef typename BitDepthParams::LhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef typename BitDepthParams::RhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_SSE.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_