1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // block_params.h: Logic to choose L1 and L2 block sizes 16 // to optimize cache-friendliness. 17 18 #ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 19 #define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 20 21 #include "common.h" 22 23 namespace gemmlowp { 24 25 // A BlockParams instance contains a full description of all the block size 26 // parameters to be used by a Gemm. 27 // There are two nested levels of block subdivisions: first a subdivision 28 // into large blocks that should fit in last-level cache (what we call L2 here) 29 // and then another subdivision into smaller blocks that should fit in 30 // L1 cache. There is then actually a third level of subdivision to fit 31 // in registers, but we are not concerned with that here. 32 struct BlockParams { 33 // L1 block parameters determine the size of small blocks that should 34 // fit in L1 cache. 35 int l1_rows; 36 int l1_cols; 37 int l1_depth; 38 39 // L2 block parameters determine the size of larger blocks that should 40 // fit in L2 cache. 41 int l2_rows; 42 int l2_cols; 43 int l2_depth; 44 45 template <typename KernelFormat> 46 void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use, 47 int l2_bytes_to_use, float l2_rhs_factor) { 48 FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, 49 l2_bytes_to_use, l2_rhs_factor, &l2_rows, 50 &l2_cols, &l2_depth); 51 FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use, 52 &l1_rows, &l1_cols, &l1_depth); 53 } 54 55 template <typename KernelFormat> 56 static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, 57 int l2_bytes_to_use, float l2_rhs_factor, 58 int* out_l2_rows, int* out_l2_cols, 59 int* out_l2_depth) { 60 int l2_rows = 0; 61 int l2_cols = 0; 62 int l2_depth = 0; 63 64 int per_thread_rows = 65 std::max(1, RoundUp<KernelFormat::kRows>(rows) / num_threads); 66 67 // No L2 blocking in the depth dimension at the moment. 68 // Too much loss of accuracy due to storing intermediate results in 69 // low precision. 70 // However, we still want to round l2_depth up to the next multiple 71 // of register size, so as to avoid having to special-case unaligned depths. 72 l2_depth = RoundUp<kRegisterSize>(depth); 73 74 { 75 int max_cache_friendly_l2_cols = std::max( 76 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); 77 int min_l2_cols_blocks = 78 std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols)); 79 l2_cols = 80 RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); 81 } 82 83 // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row 84 // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances 85 // the performance on x86. 86 if (l2_rhs_factor == 1.0f) { 87 l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows); 88 } else { 89 int max_cache_friendly_l2_rows = 90 std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / 91 (num_threads * (l2_depth + 4 * l2_cols))); 92 int min_l2_rows_blocks = std::max( 93 1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows)); 94 l2_rows = RoundUp<KernelFormat::kRows>( 95 CeilQuotient(per_thread_rows, min_l2_rows_blocks)); 96 } 97 98 *out_l2_rows = l2_rows; 99 *out_l2_cols = l2_cols; 100 *out_l2_depth = l2_depth; 101 } 102 103 template <typename KernelFormat> 104 static void FindL1BlockSizes(int rows, int cols, int depth, 105 int l1_bytes_to_use, int* out_l1_rows, 106 int* out_l1_cols, int* out_l1_depth) { 107 int l1_rows = 0; 108 int l1_cols = 0; 109 int l1_depth = 0; 110 111 // L2 block sizes should already be multiples of kernel block sizes. 112 assert(rows % KernelFormat::kRows == 0); 113 assert(cols % KernelFormat::kCols == 0); 114 assert(depth % KernelFormat::kDepth == 0); 115 116 // No L1 blocking in the columns dimension at the moment. 117 // Thought not to be needed. Similar to Eigen. 118 l1_cols = cols; 119 120 { 121 int max_cache_friendly_l1_depth = std::max( 122 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / 123 (KernelFormat::kRows + KernelFormat::kCols)); 124 int min_l1_depth_blocks = 125 std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth)); 126 l1_depth = 127 RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks)); 128 } 129 130 { 131 int max_cache_friendly_l1_rows = 132 std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); 133 int min_l1_rows_blocks = 134 std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows)); 135 l1_rows = 136 RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); 137 } 138 139 *out_l1_rows = l1_rows; 140 *out_l1_cols = l1_cols; 141 *out_l1_depth = l1_depth; 142 } 143 }; 144 145 // A SideBlockParams instance contains only the block params relevant to 146 // one side (LHS or RHS), expressed in terms of 'width' instead of 147 // rows/colums. See the explanation in kernel.h: in the LHS, 'width' means 148 // the number of rows, while in the RHS, 'width' means the number of columns. 149 // That allows us to write generic code that applies to either LHS or RHS. 150 struct SideBlockParams { 151 // L1 block parameters determine the size of small blocks that should 152 // fit in L1 cache. 153 int l1_width; 154 int l1_depth; 155 156 // L2 block parameters determine the size of larger blocks that should 157 // fit in L2 cache. 158 int l2_width; 159 int l2_depth; 160 }; 161 162 enum class Side { Lhs, Rhs }; 163 164 inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, 165 const BlockParams& block_params) { 166 side_block_params->l1_width = 167 side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; 168 side_block_params->l2_width = 169 side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; 170 171 side_block_params->l1_depth = block_params.l1_depth; 172 side_block_params->l2_depth = block_params.l2_depth; 173 } 174 175 } // namespace gemmlowp 176 177 #endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 178