// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_

#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <string>
#include <unordered_map>
#endif

namespace gemmlowp {

class SingleThreadGemmContext {
 public:
  Allocator* allocator() { return &allocator_; }

  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }

  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }

 protected:
  Allocator allocator_;

  // The cache configuration to use.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};

template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  Allocator* allocator = context->allocator();

  BlockParams block_params;
  block_params.Init<KernelFormat>(
      rows, cols, depth, 1, context->l1_bytes_to_use(),
      context->l2_bytes_to_use(), context->l2_rhs_factor());

#ifdef GEMMLOWP_PROFILING_SIZES
  // Using a static map of label strings. Not reentrant at all!
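  // The 64-bit key below packs (rows, depth, cols) into adjacent 16-bit
  // fields, so it is collision-free as long as each dimension fits in 16
  // bits; larger shapes may end up sharing a label. Like the static map,
  // this is only intended for profiling builds and is not thread-safe.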
  static std::unordered_map<std::uint64_t, std::string> labels_map;
  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                             (static_cast<std::uint64_t>(depth) << 16) ^
                             (static_cast<std::uint64_t>(cols) << 32);
  if (!labels_map.count(sizes_hash)) {
    char label[256];
    snprintf(label, sizeof(label),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    labels_map[sizes_hash] = label;
  }
  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
#endif

  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);

  PackedResult packed_result(allocator, block_params);

  allocator->Commit();

  const bool pack_rhs_once = block_params.l2_cols >= cols;

  if (pack_rhs_once) {
    PackRhs(&packed_rhs, rhs);
  }

  for (int r = 0; r < rows; r += block_params.l2_rows) {
    int rs = std::min(block_params.l2_rows, rows - r);

    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      if (!pack_rhs_once) {
        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
      }

      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
    }
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_