Home | History | Annotate | Download | only in internal
      1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // block_params.h: Logic to choose L1 and L2 block sizes
     16 // to optimize cache-friendliness.
     17 
     18 #ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
     19 #define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
     20 
     21 #include "common.h"
     22 
     23 namespace gemmlowp {
     24 
     25 // A BlockParams instance contains a full description of all the block size
     26 // parameters to be used by a Gemm.
     27 // There are two nested levels of block subdivisions: first a subdivision
     28 // into large blocks that should fit in last-level cache (what we call L2 here)
     29 // and then another subdivision into smaller blocks that should fit in
     30 // L1 cache. There is then actually a third level of subdivision to fit
     31 // in registers, but we are not concerned with that here.
     32 struct BlockParams {
     33   // L1 block parameters determine the size of small blocks that should
     34   // fit in L1 cache.
     35   int l1_rows;
     36   int l1_cols;
     37   int l1_depth;
     38 
     39   // L2 block parameters determine the size of larger blocks that should
     40   // fit in L2 cache.
     41   int l2_rows;
     42   int l2_cols;
     43   int l2_depth;
     44 
     45   template <typename KernelFormat>
     46   void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use,
     47             int l2_bytes_to_use, float l2_rhs_factor) {
     48     FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads,
     49                                    l2_bytes_to_use, l2_rhs_factor, &l2_rows,
     50                                    &l2_cols, &l2_depth);
     51     FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use,
     52                                    &l1_rows, &l1_cols, &l1_depth);
     53   }
     54 
     55   template <typename KernelFormat>
     56   static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
     57                                int l2_bytes_to_use, float l2_rhs_factor,
     58                                int* out_l2_rows, int* out_l2_cols,
     59                                int* out_l2_depth) {
     60     int l2_rows = 0;
     61     int l2_cols = 0;
     62     int l2_depth = 0;
     63 
     64     int per_thread_rows =
     65         std::max(1, RoundUp<KernelFormat::kRows>(rows) / num_threads);
     66 
     67     // No L2 blocking in the depth dimension at the moment.
     68     // Too much loss of accuracy due to storing intermediate results in
     69     // low precision.
     70     // However, we still want to round l2_depth up to the next multiple
     71     // of register size, so as to avoid having to special-case unaligned depths.
     72     l2_depth = RoundUp<kRegisterSize>(depth);
     73 
     74     {
     75       int max_cache_friendly_l2_cols = std::max(
     76           1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
     77       int min_l2_cols_blocks =
     78           std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols));
     79       l2_cols =
     80           RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
     81     }
     82 
     83     // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row
     84     // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances
     85     // the performance on x86.
     86     if (l2_rhs_factor == 1.0f) {
     87       l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows);
     88     } else {
     89       int max_cache_friendly_l2_rows =
     90           std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) /
     91                           (num_threads * (l2_depth + 4 * l2_cols)));
     92       int min_l2_rows_blocks = std::max(
     93           1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows));
     94       l2_rows = RoundUp<KernelFormat::kRows>(
     95           CeilQuotient(per_thread_rows, min_l2_rows_blocks));
     96     }
     97 
     98     *out_l2_rows = l2_rows;
     99     *out_l2_cols = l2_cols;
    100     *out_l2_depth = l2_depth;
    101   }
    102 
    103   template <typename KernelFormat>
    104   static void FindL1BlockSizes(int rows, int cols, int depth,
    105                                int l1_bytes_to_use, int* out_l1_rows,
    106                                int* out_l1_cols, int* out_l1_depth) {
    107     int l1_rows = 0;
    108     int l1_cols = 0;
    109     int l1_depth = 0;
    110 
    111     // L2 block sizes should already be multiples of kernel block sizes.
    112     assert(rows % KernelFormat::kRows == 0);
    113     assert(cols % KernelFormat::kCols == 0);
    114     assert(depth % KernelFormat::kDepth == 0);
    115 
    116     // No L1 blocking in the columns dimension at the moment.
    117     // Thought not to be needed. Similar to Eigen.
    118     l1_cols = cols;
    119 
    120     {
    121       int max_cache_friendly_l1_depth = std::max(
    122           1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) /
    123                  (KernelFormat::kRows + KernelFormat::kCols));
    124       int min_l1_depth_blocks =
    125           std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth));
    126       l1_depth =
    127           RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks));
    128     }
    129 
    130     {
    131       int max_cache_friendly_l1_rows =
    132           std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols));
    133       int min_l1_rows_blocks =
    134           std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows));
    135       l1_rows =
    136           RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
    137     }
    138 
    139     *out_l1_rows = l1_rows;
    140     *out_l1_cols = l1_cols;
    141     *out_l1_depth = l1_depth;
    142   }
    143 };
    144 
    145 // A SideBlockParams instance contains only the block params relevant to
    146 // one side (LHS or RHS), expressed in terms of 'width' instead of
    147 // rows/colums. See the explanation in kernel.h: in the LHS, 'width' means
    148 // the number of rows, while in the RHS, 'width' means the number of columns.
    149 // That allows us to write generic code that applies to either LHS or RHS.
    150 struct SideBlockParams {
    151   // L1 block parameters determine the size of small blocks that should
    152   // fit in L1 cache.
    153   int l1_width;
    154   int l1_depth;
    155 
    156   // L2 block parameters determine the size of larger blocks that should
    157   // fit in L2 cache.
    158   int l2_width;
    159   int l2_depth;
    160 };
    161 
    162 enum class Side { Lhs, Rhs };
    163 
    164 inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
    165                                const BlockParams& block_params) {
    166   side_block_params->l1_width =
    167       side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
    168   side_block_params->l2_width =
    169       side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;
    170 
    171   side_block_params->l1_depth = block_params.l1_depth;
    172   side_block_params->l2_depth = block_params.l2_depth;
    173 }
    174 
    175 }  // namespace gemmlowp
    176 
    177 #endif  // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
    178